microsoft_ngram 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +12 -3
- data/examples/segment.rb +2 -2
- data/examples/synthese.rb +60 -0
- data/examples/synthese_lib.rb +75 -0
- data/examples/veritable.rb +29 -0
- data/examples/veritable_lib.rb +121 -0
- data/lib/microsoft_ngram.rb +69 -6
- data/lib/microsoft_ngram/version.rb +1 -1
- data/microsoft_ngram.gemspec +1 -0
- metadata +100 -50
data/README.md
CHANGED
@@ -21,12 +21,12 @@ Usage
|
|
21
21
|
|
22
22
|
To get a list of currently available models:
|
23
23
|
|
24
|
-
|
25
|
-
=> ["bing-anchor/jun09/1", "bing-anchor/jun09/2", "bing-anchor/jun09/3", "bing-anchor/jun09/4", "bing-body/jun09/1", "bing-body/jun09/2", "bing-body/jun09/3", "bing-title/jun09/1", "bing-title/jun09/2", "bing-title/jun09/3", "bing-title/jun09/4", "bing-query/jun09/1", "bing-query/jun09/2", "bing-query/jun09/3"]
|
24
|
+
>> Bing::Ngram.models
|
25
|
+
=> ["bing-anchor/jun09/1", "bing-anchor/jun09/2", "bing-anchor/jun09/3", "bing-anchor/jun09/4", "bing-body/jun09/1", "bing-body/jun09/2", "bing-body/jun09/3", "bing-title/jun09/1", "bing-title/jun09/2", "bing-title/jun09/3", "bing-title/jun09/4", "bing-query/jun09/1", "bing-query/jun09/2", "bing-query/jun09/3", "bing-title/apr10/1", "bing-title/apr10/2", "bing-title/apr10/3", "bing-title/apr10/4", "bing-title/apr10/5", "bing-anchor/apr10/1", "bing-anchor/apr10/2", "bing-anchor/apr10/3", "bing-anchor/apr10/4", "bing-anchor/apr10/5", "bing-body/apr10/1", "bing-body/apr10/2", "bing-body/apr10/3", "bing-body/apr10/4", "bing-body/apr10/5"]
|
26
26
|
|
27
27
|
To see the default model:
|
28
28
|
|
29
|
-
> MicrosoftNgram.
|
29
|
+
> MicrosoftNgram.default_model
|
30
30
|
=> "bing-body/jun09/3"
|
31
31
|
|
32
32
|
Parameters to the initializer are:
|
@@ -73,6 +73,15 @@ To use the query model for the same thing:
|
|
73
73
|
vista -1.199022
|
74
74
|
installer -1.248958
|
75
75
|
|
76
|
+
You can also get a list of the N most likely candidates (could be slower for long lists):
|
77
|
+
|
78
|
+
> MicrosoftNgram.new(:model => 'bing-query/jun09/3').generate_list("Microsoft Windows",5).each {|x| puts x.join(' ')}
|
79
|
+
xp -0.5429792
|
80
|
+
</s> -1.062959
|
81
|
+
update -1.08291
|
82
|
+
vista -1.199022
|
83
|
+
installer -1.248958
|
84
|
+
|
76
85
|
Sample Script
|
77
86
|
-------------
|
78
87
|
|
data/examples/segment.rb
CHANGED
@@ -7,8 +7,8 @@ require 'memoize'
|
|
7
7
|
|
8
8
|
include Memoize
|
9
9
|
|
10
|
-
$bi_body_model = MicrosoftNgram.new(:model => "bing-body/
|
11
|
-
$uni_body_model = MicrosoftNgram.new(:model => "bing-body/
|
10
|
+
$bi_body_model = MicrosoftNgram.new(:model => "bing-body/apr10/2", :debug=>false)
|
11
|
+
$uni_body_model = MicrosoftNgram.new(:model => "bing-body/apr10/1", :debug=>false)
|
12
12
|
$magic_pr = -13.419954 # twice as uncommon as "kraig" last word in Bing 100k list
|
13
13
|
|
14
14
|
# Returns all the splits of a string up to a given length
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/synthese_lib.rb'
|
2
|
+
require 'rubygems'
|
3
|
+
require 'trollop'
|
4
|
+
|
5
|
+
def weighted_random_index l
|
6
|
+
tot = l.inject(0.0){|a,b| a+ b}
|
7
|
+
ns = l.map{|i| i/tot}
|
8
|
+
r = rand
|
9
|
+
c = 0.0
|
10
|
+
ns.each_with_index{|n,i| return i if r < (c+= n)}
|
11
|
+
l.size-1 # PRACTICE SAFETY
|
12
|
+
end
|
13
|
+
|
14
|
+
def random_punctuation
|
15
|
+
p = [".", "?", "!"]
|
16
|
+
f = [0.9, 0.05, 0.05]
|
17
|
+
p[weighted_random_index(f)]
|
18
|
+
end
|
19
|
+
|
20
|
+
# Returns a random integer r with min <= r < max.
#
# min, max - integer bounds; max is exclusive.
#
# When both bounds are equal the range is empty and min itself is returned.
# Without this guard the call would become rand(0), which yields a Float in
# [0, 1) and would turn the result into a non-integer.
def rand_btw(min, max)
  span = max - min
  return min if span == 0
  rand(span) + min
end
|
23
|
+
|
24
|
+
def tcase(w)
|
25
|
+
return w.upcase if w.size < 2
|
26
|
+
w[0..0].upcase + w[1..-1]
|
27
|
+
end
|
28
|
+
|
29
|
+
# Prints an optional title followed by the generated text to $stdout.
#
# opts - Hash with the keys read below:
#        :title   - title string; printed in title case between "== ... ==" when non-empty
#        :s       - generator object responding to generate_n(max_words, starter) and yielding words
#        :max_wc  - passed to generate_n as the word limit
#        :starter - passed to generate_n as the starting text
#
# The first yielded word is capitalised with tcase; the sentence markers
# "<s>" and "</s>" are followed by a newline. Every word is followed by a
# single space and $stdout is flushed after each word.
#
# NOTE: :min_sentence_length / :max_sentence_length are not consulted here.
# The previous code computed an unused local from (max, max), which also hit
# the rand(0) Float case; that dead computation has been removed.
def scribble(opts)
  title = opts[:title]
  if title && title.size > 0
    puts "\n== " + title.split(/ /).map { |w| tcase(w) }.join(" ") + " ==\n\n"
  end

  first_word = true
  opts[:s].generate_n(opts[:max_wc], opts[:starter]) do |w|
    if first_word
      print(tcase(w))
      first_word = false
    elsif w == "<s>" || w == "</s>"
      print("#{w}\n")
    else
      print(w)
    end
    print(" ")
    $stdout.flush
  end
end
|
40
|
+
|
41
|
+
# Command-line entry point for the synthetic writer example.
#
# Parses the options with Trollop, builds a Synthese generator from them,
# stores it under opts[:s], installs an INT handler that exits, and then
# prints the generated text via scribble.
#
# NOTE(review): Synthese.new in synthese_lib.rb reads :model_type, :debug and
# :depth; the :model option declared here does not appear to be consumed by
# it -- confirm whether that is intended.
def main
  opts = Trollop::options do
    banner "Synthetic writer"
    opt :title, "Title for story", :default => ""
    opt :starter, "Starter text for story", :default => "It was the best of times, it was the worst of times"
    opt :max_wc, "Max number of words to generate", :default => 1000
    opt :model, "Corpus model to use", :default => Bing::Ngram.default_model
    # was ":depth => 100", which is not a Trollop option key, so no default of 100 was set
    opt :depth, "How deep to look for the next token", :default => 100
    opt :max_sentence_length, "Maximum length of sentences to generate", :default => 16
    opt :min_sentence_length, "Minimum length of sentences to generate", :default => 8
    opt :debug, "Debug errors", :default=> true
  end

  opts[:s] = Synthese.new(opts)
  trap("INT","EXIT")
  scribble(opts)
end
|
58
|
+
|
59
|
+
main
|
60
|
+
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../lib/microsoft_ngram'
|
2
|
+
require 'rubygems'
|
3
|
+
|
4
|
+
class BackoffGenerator
|
5
|
+
attr_accessor :model_type, :models, :debug
|
6
|
+
|
7
|
+
def initialize(model_type="body",max_models=5,debug=false)
|
8
|
+
@model_type = model_type
|
9
|
+
@debug=debug
|
10
|
+
@models = Bing::Ngram::models_find_all(model_type).take(max_models).map{|m|Bing::Ngram.new(:model => m, :debug=>debug)}
|
11
|
+
end
|
12
|
+
|
13
|
+
def generate_list(text, n, initial_text)
|
14
|
+
@models.each do |model|
|
15
|
+
l = model.generate_list(text,n)
|
16
|
+
return l if l.size > 0
|
17
|
+
end
|
18
|
+
if (text != initial_text)
|
19
|
+
generate_list(initial_text, n, initial_text)
|
20
|
+
else
|
21
|
+
[]
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
end
|
26
|
+
|
27
|
+
# Generates a stream of words by repeatedly asking a generator for candidate
# next tokens and sampling one of them according to its probability.
class Synthese
  attr_accessor :generator, :depth, :n, :debug

  # args - Hash; keys read here:
  #        :model_type - model family handed to BackoffGenerator (default "body")
  #        :debug      - stored in @debug (default false)
  #        :depth      - number of candidates requested per step (default 100)
  #
  # @n is the size of the first model held by the generator.
  def initialize(args = {})
    mtype = args[:model_type] || "body"
    @debug = args[:debug] || false
    # NOTE(review): the generator is always created with debug=false,
    # independent of @debug -- confirm whether that is deliberate.
    @generator = BackoffGenerator.new(mtype,5,false)
    @depth = (args[:depth] ? args[:depth].to_i : 100)
    @n = Bing::ModelSpec.new(@generator.models[0].model).size
  end

  # Picks an index of l at random, weighted by the (non-negative) numbers in l.
  # Falls back to the last index if rounding keeps the cumulative sum below
  # the random draw.
  def self.weighted_random_index l
    tot = l.inject(0.0){|a,b| a+ b}
    ns = l.map{|i| i/tot}
    r = rand
    c = 0.0
    ns.each_with_index{|n,i| return i if r < (c+= n)}
    l.size-1 # PRACTICE SAFETY
  end

  # l - list of [word, log_probability] pairs.
  # Returns one word, sampled with weight exp(log_probability).
  def self.random_word l
    l[Synthese.weighted_random_index(l.map{|w,lp| Math.exp(lp)})][0]
  end

  # Yields every whitespace-separated token of text, then keeps yielding
  # sampled words until the generator returns an empty candidate list.
  # The context sent to the generator is limited to the last @n tokens.
  def generate(text)
    tokens = text.split(/\s/)
    tokens.each{|token| yield token}
    tokens = tokens[-@n..-1] if tokens.size > @n
    initial_text = text
    text = tokens.join(' ')
    loop do
      l = @generator.generate_list(text,@depth,initial_text)
      break if l.size == 0
      w = Synthese.random_word(l)
      yield w
      tokens << w
      tokens = tokens[-@n..-1] if tokens.size > @n
      text = tokens.join(' ')
    end
  end

  # Like generate, but stops after at most n words have been yielded
  # (the tokens of the starter text count towards n).
  #
  # The previous check (counter > n, tested after incrementing) let n+1
  # words through.
  def generate_n(n,text)
    return if n <= 0
    counter = 0
    generate(text) do |w|
      yield w
      counter += 1
      break if counter >= n
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/veritable_lib.rb'
|
2
|
+
require 'rubygems'
|
3
|
+
require 'trollop'
|
4
|
+
|
5
|
+
def main
|
6
|
+
opts = Trollop::options do
|
7
|
+
banner "Generate phrases from Bing's Ngram server based on patterns— specify pattern on command line as alternating strings and ints"
|
8
|
+
opt :max_wild_children, "Max number of wildcard tokens to generate", :default => 20
|
9
|
+
opt :max_literal_children, "Max number of tokens to generate after literal strings", :default => 100
|
10
|
+
opt :model, "Corpus model to use", :default => Bing::Ngram.default_model
|
11
|
+
opt :max_length, "Maximum length of phrases to generate", :default => 5
|
12
|
+
opt :verbose, "Send logging messages to $STDERR", :default => false
|
13
|
+
end
|
14
|
+
|
15
|
+
pattern = []
|
16
|
+
ARGV.each_with_index{|item, i| pattern << ((i % 2 == 0) ? item : [item.to_i])}
|
17
|
+
if opts[:verbose]
|
18
|
+
$stderr.puts opts.inspect
|
19
|
+
$stderr.puts pattern.inspect
|
20
|
+
end
|
21
|
+
R.new(pattern, opts).generate{|t| $stdout.puts(t.join("\t"))}
|
22
|
+
end
|
23
|
+
|
24
|
+
trap("INT","EXIT")
|
25
|
+
main
|
26
|
+
|
27
|
+
# examples
|
28
|
+
# veritable.rb "veritable" 2 "of" 2
|
29
|
+
# veritable.rb --max-literal-children 50 "a taste of" 2
|
@@ -0,0 +1,121 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../lib/microsoft_ngram'
|
2
|
+
|
3
|
+
# S contains the state we use to query the model.
|
4
|
+
#
|
5
|
+
class S
|
6
|
+
attr_accessor :prefix, :pattern, :max_wild_children, :max_literal_children, :max_length, :cp, :model
|
7
|
+
def initialize(prefix, pattern, max_wild_children, max_literal_children, max_length, cp, model)
|
8
|
+
@prefix = prefix
|
9
|
+
@pattern = pattern
|
10
|
+
@max_wild_children = max_wild_children
|
11
|
+
@max_literal_children = max_literal_children
|
12
|
+
@cp = cp
|
13
|
+
@model = model
|
14
|
+
@max_length = max_length
|
15
|
+
end
|
16
|
+
|
17
|
+
def to_s
|
18
|
+
"S(#{@prefix.inspect}:#{@pattern.inspect}:#{cp})"
|
19
|
+
end
|
20
|
+
|
21
|
+
def children
|
22
|
+
return [] if @pattern.size == 0
|
23
|
+
return [] if @prefix.split.size >= @max_length
|
24
|
+
hd = @pattern.first
|
25
|
+
tail = @pattern[1..-1]
|
26
|
+
# puts "pattern: #{@pattern.inspect}; hd: #{hd.inspect}; tail: #{tail.inspect}"
|
27
|
+
if hd.class == String
|
28
|
+
string_children(hd, tail)
|
29
|
+
else
|
30
|
+
wildcard_children(hd, tail)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
def string_children(hd, tail)
|
35
|
+
query = @prefix.size > 0 ? "#{@prefix} #{@pattern[0]}" : @pattern[0]
|
36
|
+
pairs = @model.generate_list(query,@max_literal_children)
|
37
|
+
pairs.map{|w,cp| S.new("#{query} #{w}", tail, @max_wild_children, @max_literal_children, @max_length, cp, @model)}
|
38
|
+
end
|
39
|
+
|
40
|
+
def wildcard_children(hd, tail)
|
41
|
+
n = @pattern[0][0]
|
42
|
+
return [] if n<1 # the 'one' was generated by the string children or the previous iteration
|
43
|
+
pairs = @model.generate_list(@prefix,@max_wild_children)
|
44
|
+
pairs.map{|w,cp| S.new("#{@prefix} #{w}",([[n-1]] + tail),@max_wild_children, @max_literal_children, @max_length, cp, @model)}
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
# R is the machine that generates most likely phrases with wildcards
|
49
|
+
# it is initialized with
|
50
|
+
# pattern = an array of alternating string and size requirements specified as [i]
|
51
|
+
# eg ["veritable", [2], "of", [2]] -> look for 'veritable', then up to 2 tokens then 'of', then up to two tokens
|
52
|
+
# it has to be at least [1]
|
53
|
+
# max_wild_children : for wildcards (size requirement), the maximum number to inspect at this point
|
54
|
+
# max_literal_children: for strings, the max number of tokens to inspect
|
55
|
+
# model: the corpus in bing format, eg "bing-body/apr10/5". Leave nil to use the 'best' (latest, longest, body)
|
56
|
+
# verbose: if true, a few messages printed to $stderr
|
57
|
+
# Generates phrases that follow a pattern of literal strings and wildcard
# slots by walking S states depth-first and querying the model.
class R
  attr_accessor :pattern, :regex, :max_wild_children, :max_literal_children, :max_length, :model, :verbose

  # pattern - Array whose first item is a String and whose items are Strings
  #           or Arrays starting with an Integer (e.g. ["veritable", [2], "of", [2]]).
  # args    - Hash; keys read here:
  #           :model                - nil (a default Bing::Ngram is created), a model
  #                                   name String, or an already-built model object
  #           :max_wild_children    - default 20
  #           :max_literal_children - default 100
  #           :max_length           - default: size parsed from the model name
  #           :verbose              - default false
  #
  # Raises RuntimeError when the pattern is malformed.
  def initialize(pattern, args = {})
    @pattern = pattern
    raise "Pattern must be array" unless @pattern.is_a?(Array)
    raise "First item #{@pattern[0].inspect} must be a string" unless @pattern[0].is_a?(String)
    # Integer instead of Fixnum: Fixnum is gone on current Ruby, and
    # is_a?(Integer) is also true for Fixnum values on older versions.
    unless pattern.all?{|el| el.is_a?(String) || (el.is_a?(Array) && el[0].is_a?(Integer))}
      raise "Pattern items must be strings or arrays of numbers"
    end
    model = args[:model]
    @model =
      if !model
        Bing::Ngram.new
      elsif model.is_a?(String)
        # Bing::Ngram#initialize takes an args hash, not a bare model name
        Bing::Ngram.new(:model => model)
      else
        model
      end
    @max_wild_children = args[:max_wild_children] || 20
    @max_literal_children = args[:max_literal_children] || 100
    # Bing::ModelSpec is the spec parser defined in lib/microsoft_ngram.rb
    @max_length = args[:max_length] || Bing::ModelSpec.new(@model.model).size
    @verbose = args[:verbose] || false
    @regex = pattern_to_regex(pattern)
  end

  # Regex source for a wildcard slot that accepts between 1 and n tokens.
  def array_to_regex(n)
    case n
    when 1
      "\\S+"
    else
      "(?:\\S+ ){0,#{n-1}}\\S+"
    end
  end

  # converts the pattern to a reg expression
  # Literal strings are escaped so that characters such as "?" or "+" in a
  # literal are matched verbatim instead of acting as regex operators.
  def pattern_to_regex(pattern)
    str = pattern.map do |el|
      el.is_a?(String) ? Regexp.escape(el) : array_to_regex(el[0])
    end.join(" ")
    Regexp.new(str)
  end

  # does the generated phrase match?
  # Returns true on a match, nil otherwise.
  def matches(str)
    (@regex =~ str) && true
  end

  # generate -- yields tuples of prefix, cp, and jp
  # States are expanded depth-first from an explicit stack; every state whose
  # prefix matches the pattern regex is yielded together with the joint
  # probability obtained from @model.jp.
  def generate
    stack = []
    s = S.new("",@pattern, @max_wild_children, @max_literal_children, @max_length, 0.0, @model)
    $stderr.puts "Generate:Initializing.. #{s.inspect}." if @verbose
    stack.push(s)
    until stack.empty?
      current = stack.pop
      $stderr.puts("Current state: #{current}; stack size: #{stack.size}") if @verbose
      if matches(current.prefix)
        jp = @model.jp(current.prefix)
        yield [current.prefix, current.cp, jp]
      else
        $stderr.puts("Not a match: #{current.prefix.inspect}") if @verbose
      end
      current.children.each do |child|
        stack.push(child)
        # was guarded by the misspelled @verbse, so it never printed
        $stderr.puts "Added child: #{child}" if @verbose
      end
    end
  end
end
|
115
|
+
|
116
|
+
|
117
|
+
|
118
|
+
|
119
|
+
# examples
|
120
|
+
# #x = R.new([ "veritable", [2], "of", [2] ], 20, 100, 5)
|
121
|
+
|
data/lib/microsoft_ngram.rb
CHANGED
@@ -1,8 +1,41 @@
|
|
1
|
-
require
|
1
|
+
require 'rubygems'
|
2
2
|
require "rest-client"
|
3
3
|
|
4
4
|
module Bing
|
5
|
-
|
5
|
+
|
6
|
+
# this class is only used to find the best default model,
|
7
|
+
# that is, for the default_model call
|
8
|
+
class ModelSpec
|
9
|
+
|
10
|
+
attr_accessor :model_type, :date, :size
|
11
|
+
|
12
|
+
def initialize (spec)
|
13
|
+
def parse_month(m)
|
14
|
+
case m
|
15
|
+
when 'jan': '01'
|
16
|
+
when 'feb': '02'
|
17
|
+
when 'mar': '03'
|
18
|
+
when 'apr': '04'
|
19
|
+
when 'may': '05'
|
20
|
+
when 'jun': '06'
|
21
|
+
when 'jul': '07'
|
22
|
+
when 'aug': '08'
|
23
|
+
when 'sep': '09'
|
24
|
+
when 'oct': '10'
|
25
|
+
when 'nov': '11'
|
26
|
+
when 'dec': '12'
|
27
|
+
else '??'
|
28
|
+
end
|
29
|
+
end
|
30
|
+
parts = spec.split('/')
|
31
|
+
@model_type = parts[0].split('-')[1] # 'bing-body'
|
32
|
+
yr = parts[1].split(/\D/)[-1].to_i + 2000
|
33
|
+
month = parse_month(parts[1].split(/\d/)[0])
|
34
|
+
@date = "#{yr}-#{month}"
|
35
|
+
@size = parts[2].to_i
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
6
39
|
class Ngram
|
7
40
|
|
8
41
|
@@endpoint = "http://web-ngram.research.microsoft.com/rest/lookup.svc/"
|
@@ -17,6 +50,26 @@ module Bing
|
|
17
50
|
@@models.include?(model)
|
18
51
|
end
|
19
52
|
|
53
|
+
def self.default_model(model_type='body') #most recent, longest
|
54
|
+
Bing::Ngram.models() if @@models == nil # cache the current models
|
55
|
+
ms = @@models.
|
56
|
+
map{|x| [ModelSpec.new(x),x]}.
|
57
|
+
find_all{|c| c[0].model_type == model_type}.
|
58
|
+
sort_by{|c| [c[0].date, c[0].size]}
|
59
|
+
@@default_model = ms.size > 0 ? ms[-1][1] : nil
|
60
|
+
end
|
61
|
+
|
62
|
+
def self.models_find_all(model_type='body',min_size=1)
|
63
|
+
Bing::Ngram.models() if @@models == nil # cache the current models
|
64
|
+
ms = @@models.
|
65
|
+
map{|x| [ModelSpec.new(x),x]}.
|
66
|
+
find_all{|c| c[0].model_type == model_type}.
|
67
|
+
find_all{|c| c[0].size >= min_size}.
|
68
|
+
sort_by{|c| [c[0].date, c[0].size]}.
|
69
|
+
map{|spec,m| m}.
|
70
|
+
reverse
|
71
|
+
end
|
72
|
+
|
20
73
|
attr_accessor :user_token
|
21
74
|
# The model is the current model. Query this.models() for available models
|
22
75
|
attr_accessor :model
|
@@ -24,25 +77,28 @@ module Bing
|
|
24
77
|
attr_accessor :debug
|
25
78
|
# Ngram size based on model
|
26
79
|
attr_accessor :ngram_size
|
27
|
-
|
80
|
+
|
28
81
|
def initialize(args = {})
|
29
82
|
@user_token = args["user_token"] || args[:user_token] || ENV["NGRAM_TOKEN"]
|
30
83
|
unless @user_token
|
31
84
|
raise "Must provide user token as NGRAM_TOKEN env variable or as :user_token => token. To get a token, see http://web-ngram.research.microsoft.com/info/ "
|
32
85
|
end
|
33
86
|
# probably shouldn't change
|
34
|
-
@model = args["model"] || args[:model] || Bing::Ngram.
|
87
|
+
@model = args["model"] || args[:model] || Bing::Ngram.default_model()
|
35
88
|
unless Bing::Ngram.defined_model?(@model)
|
36
89
|
raise "Invalid model: #{@model}. Valid models are #{@@models.join('; ')}"
|
37
90
|
end
|
38
91
|
@debug = (args["debug"] || args[:debug] || nil)
|
92
|
+
#puts "Creating #{@model.inspect} with debug=#{@debug}"
|
39
93
|
@ngram_size = @model.split(/\//)[-1].to_i
|
40
94
|
end
|
41
95
|
|
96
|
+
|
97
|
+
|
42
98
|
def get(op,phrase,args)
|
43
99
|
model = args["model"] || args[:model] || @model
|
44
100
|
RestClient.get(@@endpoint + model + '/' + op, {:params => {:u => @user_token, :p => phrase}.merge(args)}) do |res,req,result|
|
45
|
-
$stderr.puts
|
101
|
+
$stderr.puts res.inspect if @debug
|
46
102
|
res
|
47
103
|
end
|
48
104
|
end
|
@@ -50,7 +106,7 @@ module Bing
|
|
50
106
|
def post(op,phrases,args)
|
51
107
|
model = args["model"] || args[:model] || @model
|
52
108
|
RestClient.post(@@endpoint + model + '/' + op + "?u=#{@user_token}", phrases.join("\n")) do |res,req,result|
|
53
|
-
$stderr.puts
|
109
|
+
$stderr.puts res.inspect if @debug
|
54
110
|
res
|
55
111
|
end
|
56
112
|
end
|
@@ -89,6 +145,13 @@ module Bing
|
|
89
145
|
end
|
90
146
|
end
|
91
147
|
end
|
148
|
+
|
149
|
+
# get a list of the next most popular tokens with log-freq
|
150
|
+
def generate_list(phrase, max_length)
|
151
|
+
l = []
|
152
|
+
generate(phrase,max_length){|p| l << p}
|
153
|
+
l
|
154
|
+
end
|
92
155
|
|
93
156
|
# spell-checking
|
94
157
|
# Bing::Ngram.new(:debug=>nil,:model=>'bing-body/jun09/1').jps(edits1("appresiate").uniq).sort{|a,b| b[1] <=> a[1]}[0..30]
|
data/microsoft_ngram.gemspec
CHANGED
metadata
CHANGED
@@ -1,70 +1,106 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: microsoft_ngram
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 23
|
5
5
|
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 4
|
10
|
+
version: 0.0.4
|
6
11
|
platform: ruby
|
7
|
-
authors:
|
12
|
+
authors:
|
8
13
|
- Will Fitzgerald
|
9
14
|
- Zeke Sikelianos
|
10
15
|
autorequire:
|
11
16
|
bindir: bin
|
12
17
|
cert_chain: []
|
13
|
-
|
14
|
-
|
15
|
-
|
18
|
+
|
19
|
+
date: 2012-02-09 00:00:00 Z
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
16
22
|
name: hoe
|
17
|
-
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
18
25
|
none: false
|
19
|
-
requirements:
|
20
|
-
- -
|
21
|
-
- !ruby/object:Gem::Version
|
22
|
-
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
23
33
|
type: :development
|
24
|
-
|
25
|
-
|
26
|
-
- !ruby/object:Gem::Dependency
|
34
|
+
version_requirements: *id001
|
35
|
+
- !ruby/object:Gem::Dependency
|
27
36
|
name: rspec
|
28
|
-
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
29
39
|
none: false
|
30
|
-
requirements:
|
40
|
+
requirements:
|
31
41
|
- - ~>
|
32
|
-
- !ruby/object:Gem::Version
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 47
|
44
|
+
segments:
|
45
|
+
- 2
|
46
|
+
- 8
|
47
|
+
- 0
|
33
48
|
version: 2.8.0
|
34
49
|
type: :development
|
35
|
-
|
36
|
-
|
37
|
-
- !ruby/object:Gem::Dependency
|
50
|
+
version_requirements: *id002
|
51
|
+
- !ruby/object:Gem::Dependency
|
38
52
|
name: autotest
|
39
|
-
|
53
|
+
prerelease: false
|
54
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
40
55
|
none: false
|
41
|
-
requirements:
|
42
|
-
- -
|
43
|
-
- !ruby/object:Gem::Version
|
44
|
-
|
56
|
+
requirements:
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
hash: 3
|
60
|
+
segments:
|
61
|
+
- 0
|
62
|
+
version: "0"
|
45
63
|
type: :development
|
46
|
-
|
47
|
-
|
48
|
-
- !ruby/object:Gem::Dependency
|
64
|
+
version_requirements: *id003
|
65
|
+
- !ruby/object:Gem::Dependency
|
49
66
|
name: rest-client
|
50
|
-
|
67
|
+
prerelease: false
|
68
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
51
69
|
none: false
|
52
|
-
requirements:
|
53
|
-
- -
|
54
|
-
- !ruby/object:Gem::Version
|
55
|
-
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
hash: 3
|
74
|
+
segments:
|
75
|
+
- 0
|
76
|
+
version: "0"
|
56
77
|
type: :runtime
|
78
|
+
version_requirements: *id004
|
79
|
+
- !ruby/object:Gem::Dependency
|
80
|
+
name: trollop
|
57
81
|
prerelease: false
|
58
|
-
|
82
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ">="
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
hash: 3
|
88
|
+
segments:
|
89
|
+
- 0
|
90
|
+
version: "0"
|
91
|
+
type: :runtime
|
92
|
+
version_requirements: *id005
|
59
93
|
description: A simple wrapper for Bing's ngram API
|
60
|
-
email:
|
94
|
+
email:
|
61
95
|
- will@wordnik.com
|
62
96
|
- zeke@sikelianos.com
|
63
|
-
executables:
|
97
|
+
executables:
|
64
98
|
- microsoft_ngram
|
65
99
|
extensions: []
|
100
|
+
|
66
101
|
extra_rdoc_files: []
|
67
|
-
|
102
|
+
|
103
|
+
files:
|
68
104
|
- .gitignore
|
69
105
|
- .rspec
|
70
106
|
- Gemfile
|
@@ -74,6 +110,10 @@ files:
|
|
74
110
|
- Rakefile
|
75
111
|
- bin/microsoft_ngram
|
76
112
|
- examples/segment.rb
|
113
|
+
- examples/synthese.rb
|
114
|
+
- examples/synthese_lib.rb
|
115
|
+
- examples/veritable.rb
|
116
|
+
- examples/veritable_lib.rb
|
77
117
|
- lib/microsoft_ngram.rb
|
78
118
|
- lib/microsoft_ngram/version.rb
|
79
119
|
- microsoft_ngram.gemspec
|
@@ -81,26 +121,36 @@ files:
|
|
81
121
|
- spec/spec_helper.rb
|
82
122
|
homepage: http://developer.wordnik.com
|
83
123
|
licenses: []
|
124
|
+
|
84
125
|
post_install_message:
|
85
126
|
rdoc_options: []
|
86
|
-
|
127
|
+
|
128
|
+
require_paths:
|
87
129
|
- lib
|
88
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
130
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
89
131
|
none: false
|
90
|
-
requirements:
|
91
|
-
- -
|
92
|
-
- !ruby/object:Gem::Version
|
93
|
-
|
94
|
-
|
132
|
+
requirements:
|
133
|
+
- - ">="
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
hash: 3
|
136
|
+
segments:
|
137
|
+
- 0
|
138
|
+
version: "0"
|
139
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
95
140
|
none: false
|
96
|
-
requirements:
|
97
|
-
- -
|
98
|
-
- !ruby/object:Gem::Version
|
99
|
-
|
141
|
+
requirements:
|
142
|
+
- - ">="
|
143
|
+
- !ruby/object:Gem::Version
|
144
|
+
hash: 3
|
145
|
+
segments:
|
146
|
+
- 0
|
147
|
+
version: "0"
|
100
148
|
requirements: []
|
149
|
+
|
101
150
|
rubyforge_project: microsoft_ngram
|
102
|
-
rubygems_version: 1.8.
|
151
|
+
rubygems_version: 1.8.13
|
103
152
|
signing_key:
|
104
153
|
specification_version: 3
|
105
154
|
summary: A simple wrapper for Bing's ngram API
|
106
155
|
test_files: []
|
156
|
+
|