microsoft_ngram 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +12 -3
- data/examples/segment.rb +2 -2
- data/examples/synthese.rb +60 -0
- data/examples/synthese_lib.rb +75 -0
- data/examples/veritable.rb +29 -0
- data/examples/veritable_lib.rb +121 -0
- data/lib/microsoft_ngram.rb +69 -6
- data/lib/microsoft_ngram/version.rb +1 -1
- data/microsoft_ngram.gemspec +1 -0
- metadata +100 -50
data/README.md
CHANGED
@@ -21,12 +21,12 @@ Usage
|
|
21
21
|
|
22
22
|
To get a list of currently available models:
|
23
23
|
|
24
|
-
|
25
|
-
=> ["bing-anchor/jun09/1", "bing-anchor/jun09/2", "bing-anchor/jun09/3", "bing-anchor/jun09/4", "bing-body/jun09/1", "bing-body/jun09/2", "bing-body/jun09/3", "bing-title/jun09/1", "bing-title/jun09/2", "bing-title/jun09/3", "bing-title/jun09/4", "bing-query/jun09/1", "bing-query/jun09/2", "bing-query/jun09/3"]
|
24
|
+
>> Bing::Ngram.models
|
25
|
+
=> ["bing-anchor/jun09/1", "bing-anchor/jun09/2", "bing-anchor/jun09/3", "bing-anchor/jun09/4", "bing-body/jun09/1", "bing-body/jun09/2", "bing-body/jun09/3", "bing-title/jun09/1", "bing-title/jun09/2", "bing-title/jun09/3", "bing-title/jun09/4", "bing-query/jun09/1", "bing-query/jun09/2", "bing-query/jun09/3", "bing-title/apr10/1", "bing-title/apr10/2", "bing-title/apr10/3", "bing-title/apr10/4", "bing-title/apr10/5", "bing-anchor/apr10/1", "bing-anchor/apr10/2", "bing-anchor/apr10/3", "bing-anchor/apr10/4", "bing-anchor/apr10/5", "bing-body/apr10/1", "bing-body/apr10/2", "bing-body/apr10/3", "bing-body/apr10/4", "bing-body/apr10/5"]
|
26
26
|
|
27
27
|
To see the default model:
|
28
28
|
|
29
|
-
> MicrosoftNgram.
|
29
|
+
> MicrosoftNgram.default_model
|
30
30
|
=> "bing-body/jun09/3"
|
31
31
|
|
32
32
|
Parameters to the initializer are:
|
@@ -73,6 +73,15 @@ To use the query model for the same thing:
|
|
73
73
|
vista -1.199022
|
74
74
|
installer -1.248958
|
75
75
|
|
76
|
+
You can also get a list of the N most likely candidates (could be slower for long lists):
|
77
|
+
|
78
|
+
> MicrosoftNgram.new(:model => 'bing-query/jun09/3').generate_list("Microsoft Windows",5).each {|x| puts x.join(' ')}
|
79
|
+
xp -0.5429792
|
80
|
+
</s> -1.062959
|
81
|
+
update -1.08291
|
82
|
+
vista -1.199022
|
83
|
+
installer -1.248958
|
84
|
+
|
76
85
|
Sample Script
|
77
86
|
-------------
|
78
87
|
|
data/examples/segment.rb
CHANGED
@@ -7,8 +7,8 @@ require 'memoize'
|
|
7
7
|
|
8
8
|
include Memoize
|
9
9
|
|
10
|
-
$bi_body_model = MicrosoftNgram.new(:model => "bing-body/
|
11
|
-
$uni_body_model = MicrosoftNgram.new(:model => "bing-body/
|
10
|
+
$bi_body_model = MicrosoftNgram.new(:model => "bing-body/apr10/2", :debug=>false)
|
11
|
+
$uni_body_model = MicrosoftNgram.new(:model => "bing-body/apr10/1", :debug=>false)
|
12
12
|
$magic_pr = -13.419954 # twice as uncommon as "kraig" last word in Bing 100k list
|
13
13
|
|
14
14
|
# Returns all the splits of a string up to a given length
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/synthese_lib.rb'
|
2
|
+
require 'rubygems'
|
3
|
+
require 'trollop'
|
4
|
+
|
5
|
+
# Picks a random index into +l+, where each index is chosen with probability
# proportional to its weight.
#
# l - Array of non-negative numeric weights.
#
# Returns an Integer index. Falls back to the last index (l.size - 1, which
# is -1 for an empty list) when no running total exceeds the random threshold,
# e.g. when every weight is zero.
def weighted_random_index(l)
  total = l.inject(0.0) { |sum, weight| sum + weight }
  # Scale the random draw by the total instead of normalising every weight;
  # this avoids an intermediate array and a division by zero when total == 0.
  threshold = rand * total
  running = 0.0
  l.each_with_index do |weight, index|
    running += weight
    return index if threshold < running
  end
  l.size - 1 # safety net for rounding error / all-zero weights
end
|
13
|
+
|
14
|
+
# Returns a sentence-ending mark chosen at random: "." with weight 0.9,
# "?" and "!" with weight 0.05 each.
def random_punctuation
  # Named +marks+ rather than +p+ so the local does not shadow Kernel#p.
  marks = [".", "?", "!"]
  weights = [0.9, 0.05, 0.05]
  marks[weighted_random_index(weights)]
end
|
19
|
+
|
20
|
+
# Returns a random integer in the half-open range [min, max).
#
# min - Integer lower bound (inclusive).
# max - Integer upper bound (exclusive).
#
# When min == max the range is empty and +min+ itself is returned.
def rand_btw(min, max)
  span = max - min
  # Kernel#rand(0) returns a Float in [0, 1), which would leak a fractional
  # value to callers expecting an integer; short-circuit that case.
  return min if span == 0
  rand(span) + min
end
|
23
|
+
|
24
|
+
# Title-cases a single word: upper-cases the first character and leaves the
# rest of the word untouched.
#
# w - String word (may be empty).
#
# Returns a new String.
def tcase(w)
  # Empty and one-character words are handled up front because
  # ""[1..-1] is nil and cannot be appended.
  return w.upcase if w.size < 2
  "#{w[0..0].upcase}#{w[1..-1]}"
end
|
28
|
+
|
29
|
+
# Prints an optional title banner followed by generated text on $stdout.
#
# opts - Hash of options; this method reads:
#        :title   - String title; printed title-cased between "== ... =="
#                   when present and non-empty.
#        :s       - generator object responding to generate_n(max, starter)
#                   and yielding one token at a time.
#        :max_wc  - passed to generate_n as the word limit.
#        :starter - passed to generate_n as the starter text.
#
# The first token is title-cased; the sentence markers "<s>" and "</s>" are
# followed by a newline; every token is followed by a space and the stream is
# flushed after each one so output appears incrementally.
def scribble(opts)
  title = opts[:title]
  if title && title.size > 0
    puts "\n== " + title.split(/ /).map { |w| tcase(w) }.join(" ") + " ==\n\n"
  end

  first = true
  opts[:s].generate_n(opts[:max_wc], opts[:starter]) do |w|
    if first
      print(tcase(w))
      first = false
    elsif w == "<s>" || w == "</s>"
      print("#{w}\n")
    else
      print(w)
    end
    print(" ")
    $stdout.flush
  end
end
|
40
|
+
|
41
|
+
# Command-line entry point: parses options with Trollop, builds a Synthese
# generator from them and streams the generated text via scribble.
def main
  opts = Trollop::options do
    banner "Synthetic writer"
    opt :title, "Title for story", :default => ""
    opt :starter, "Starter text for story", :default => "It was the best of times, it was the worst of times"
    opt :max_wc, "Max number of words to generate", :default => 1000
    opt :model, "Corpus model to use", :default => Bing::Ngram.default_model
    # :default (not :depth) is the key that carries the default value.
    opt :depth, "How deep to look for the next token", :default => 100
    opt :max_sentence_length, "Maximum length of sentences to generate", :default => 16
    opt :min_sentence_length, "Minimum length of sentences to generate", :default => 8
    opt :debug, "Debug errors", :default=> true
  end

  # NOTE(review): Synthese#initialize reads :model_type, :debug and :depth;
  # the :model option parsed above does not appear to be consumed -- confirm.
  opts[:s] = Synthese.new(opts)
  trap("INT","EXIT") # exit quietly on Ctrl-C
  scribble(opts)
end
|
58
|
+
|
59
|
+
main
|
60
|
+
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../lib/microsoft_ngram'
|
2
|
+
require 'rubygems'
|
3
|
+
|
4
|
+
# Wraps several Bing::Ngram models of one type and queries them in order,
# backing off to the next model whenever the current one has no candidates.
class BackoffGenerator
  attr_accessor :model_type, :models, :debug

  # model_type - String model family passed to Bing::Ngram.models_find_all.
  # max_models - maximum number of models to keep from that list.
  # debug      - debug flag forwarded to every Bing::Ngram instance.
  def initialize(model_type="body", max_models=5, debug=false)
    @model_type = model_type
    @debug = debug
    names = Bing::Ngram.models_find_all(model_type).take(max_models)
    @models = names.map { |m| Bing::Ngram.new(:model => m, :debug => debug) }
  end

  # Returns the first non-empty candidate list produced by any model for
  # +text+ (at most +n+ candidates are requested from each model).
  #
  # If every model comes back empty, the query is retried once with
  # +initial_text+; if that is what was just tried, returns [].
  def generate_list(text, n, initial_text)
    @models.each do |model|
      candidates = model.generate_list(text, n)
      return candidates if candidates.size > 0
    end
    return [] if text == initial_text
    generate_list(initial_text, n, initial_text)
  end
end
|
26
|
+
|
27
|
+
# Generates a stream of tokens by repeatedly asking a BackoffGenerator for
# likely next tokens and sampling one of them at random.
class Synthese
  attr_accessor :generator, :depth, :n, :debug

  # args - Hash of options:
  #        :model_type - model family for the BackoffGenerator (default "body").
  #        :debug      - stored in @debug (default false).
  #        :depth      - number of candidates requested per step (default 100).
  def initialize(args = {})
    mtype = args[:model_type] || "body"
    @debug = args[:debug] || false
    @generator = BackoffGenerator.new(mtype, 5, false)
    @depth = (args[:depth] ? args[:depth].to_i : 100)
    # Context window size is taken from the first model the generator holds.
    @n = Bing::ModelSpec.new(@generator.models[0].model).size
  end

  # Picks a random index into +l+ with probability proportional to its weight.
  # Returns l.size - 1 when no running total exceeds the random threshold.
  def self.weighted_random_index(l)
    total = l.inject(0.0) { |sum, weight| sum + weight }
    threshold = rand * total
    running = 0.0
    l.each_with_index do |weight, index|
      running += weight
      return index if threshold < running
    end
    l.size - 1 # safety net for rounding error / all-zero weights
  end

  # Samples one word from +l+, a list of [word, log_probability] pairs,
  # weighting each word by Math.exp of its log probability.
  #
  # NOTE(review): Math.exp treats the scores as natural logs; if the service
  # reports base-10 logs the weights should be 10**lp -- confirm.
  def self.random_word(l)
    weights = l.map { |_word, lp| Math.exp(lp) }
    l[Synthese.weighted_random_index(weights)][0]
  end

  # Yields every whitespace-separated token of +text+, then keeps yielding
  # sampled tokens until the generator returns no candidates. Only the last
  # @n tokens are sent as context for each query.
  def generate(text)
    tokens = text.split(/\s/)
    tokens.each { |token| yield token }
    tokens = tokens[-@n..-1] if tokens.size > @n
    initial_text = text
    text = tokens.join(' ')
    loop do
      l = @generator.generate_list(text, @depth, initial_text)
      break if l.size == 0
      w = Synthese.random_word(l)
      yield w
      tokens << w
      tokens = tokens[-@n..-1] if tokens.size > @n
      text = tokens.join(' ')
    end
  end

  # Like #generate, but yields at most +n+ tokens (starter tokens included).
  def generate_n(n, text)
    return if n <= 0
    counter = 0
    generate(text) do |w|
      yield w
      counter += 1
      # Stop as soon as the n-th token has been delivered, before another
      # query is issued.
      break if counter >= n
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/veritable_lib.rb'
|
2
|
+
require 'rubygems'
|
3
|
+
require 'trollop'
|
4
|
+
|
5
|
+
# Command-line entry point: parses options with Trollop, turns the remaining
# arguments into a pattern (even positions are literal strings, odd positions
# become one-element integer arrays) and prints every generated tuple as a
# tab-separated line on $stdout.
def main
  opts = Trollop::options do
    banner "Generate phrases from Bing's Ngram server based on patterns— specify pattern on command line as alternating strings and ints"
    opt :max_wild_children, "Max number of wildcard tokens to generate", :default => 20
    opt :max_literal_children, "Max number of tokens to generate after literal strings", :default => 100
    opt :model, "Corpus model to use", :default => Bing::Ngram.default_model
    opt :max_length, "Maximum length of phrases to generate", :default => 5
    opt :verbose, "Send logging messages to $STDERR", :default => false
  end

  pattern = []
  ARGV.each_with_index do |arg, position|
    if position % 2 == 0
      pattern << arg
    else
      pattern << [arg.to_i]
    end
  end

  if opts[:verbose]
    $stderr.puts opts.inspect
    $stderr.puts pattern.inspect
  end

  R.new(pattern, opts).generate do |tuple|
    $stdout.puts(tuple.join("\t"))
  end
end
|
23
|
+
|
24
|
+
trap("INT","EXIT")
|
25
|
+
main
|
26
|
+
|
27
|
+
# examples
|
28
|
+
# veritable.rb "veritable" 2 "of" 2
|
29
|
+
# veritable.rb --max-literal-children 50 "a taste of" 2
|
@@ -0,0 +1,121 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../lib/microsoft_ngram'
|
2
|
+
|
3
|
+
# S contains the state we use to query the model.
|
4
|
+
#
|
5
|
+
# One search state: the phrase built so far (+prefix+), the part of the
# pattern still to be consumed, the limits that bound expansion, the score
# (+cp+) attached to the token that produced this state, and the model queried.
class S
  attr_accessor :prefix, :pattern, :max_wild_children, :max_literal_children, :max_length, :cp, :model

  def initialize(prefix, pattern, max_wild_children, max_literal_children, max_length, cp, model)
    @prefix = prefix
    @pattern = pattern
    @max_wild_children = max_wild_children
    @max_literal_children = max_literal_children
    @cp = cp
    @model = model
    @max_length = max_length
  end

  def to_s
    "S(#{@prefix.inspect}:#{@pattern.inspect}:#{cp})"
  end

  # Returns the successor states of this state, or [] when the pattern is
  # exhausted or the prefix already holds max_length tokens.
  def children
    return [] if @pattern.size == 0
    return [] if @prefix.split.size >= @max_length
    hd = @pattern.first
    tail = @pattern[1..-1]
    if hd.is_a?(String)
      string_children(hd, tail)
    else
      wildcard_children(hd, tail)
    end
  end

  # Expands a literal pattern item: appends +hd+ to the prefix, asks the model
  # for up to max_literal_children next tokens, and returns one state per
  # token with +tail+ as the remaining pattern.
  def string_children(hd, tail)
    query = @prefix.size > 0 ? "#{@prefix} #{hd}" : hd
    pairs = @model.generate_list(query, @max_literal_children)
    pairs.map do |w, cp|
      S.new("#{query} #{w}", tail, @max_wild_children, @max_literal_children, @max_length, cp, @model)
    end
  end

  # Expands a wildcard pattern item +hd+ (a one-element array [n]): asks the
  # model for up to max_wild_children next tokens and returns one state per
  # token with the wildcard count decremented.
  #
  # NOTE(review): when n < 1 this returns [] without moving on to +tail+, so
  # later literal items are never expanded here -- confirm that is intended.
  def wildcard_children(hd, tail)
    n = hd[0]
    return [] if n < 1 # the 'one' was generated by the string children or the previous iteration
    pairs = @model.generate_list(@prefix, @max_wild_children)
    pairs.map do |w, cp|
      S.new("#{@prefix} #{w}", [[n - 1]] + tail, @max_wild_children, @max_literal_children, @max_length, cp, @model)
    end
  end
end
|
47
|
+
|
48
|
+
# R is the machine that generates most likely phrases with wildcards
|
49
|
+
# it is initialized with
|
50
|
+
# pattern = an array of alternating string and size requirements specified as [i]
|
51
|
+
# eg ["veritable", [2], "of", [2]] -> look for 'veritable', then up to 2 tokens, then 'of', then up to 2 tokens
|
52
|
+
# it has to be at least [1]
|
53
|
+
# max_wild_children : for wildcards (size requirement), the maximum number to inspect at this point
|
54
|
+
# max_literal_children: for strings, the max number of tokens to inspect
|
55
|
+
# model: the corpus in bing format, eg "bing-body/apr10/5". Leave nil to use the 'best' (latest, longest, body)
|
56
|
+
# verbose: if true, a few messages printed to $stderr
|
57
|
+
# Generates phrases from a pattern of alternating literal strings and
# wildcard sizes, yielding those that match the pattern's regular expression.
class R
  attr_accessor :pattern, :regex, :max_wild_children, :max_literal_children, :max_length, :model, :verbose

  # pattern - Array starting with a String; every item is either a String or
  #           a one-element Array holding an Integer >= 1.
  # args    - Hash of options:
  #           :model                - nil (build a default Bing::Ngram), a
  #                                   model name String, or a ready model object.
  #           :max_wild_children    - default 20.
  #           :max_literal_children - default 100.
  #           :max_length           - default: the size parsed from the
  #                                   model's name.
  #           :verbose              - default false.
  #
  # Raises RuntimeError when the pattern is malformed.
  def initialize(pattern, args = {})
    @pattern = pattern
    raise "Pattern must be array" unless @pattern.is_a?(Array)
    raise "First item #{@pattern[0].inspect} must be a string" unless @pattern[0].is_a?(String)
    # Integer (not Fixnum) so the check works on every Ruby version.
    unless pattern.all? { |el| el.is_a?(String) || (el.is_a?(Array) && el[0].is_a?(Integer)) }
      raise "Pattern items must be strings or arrays of numbers"
    end
    if pattern.any? { |el| el.is_a?(Array) && el[0] < 1 }
      raise "Wildcard sizes must be at least 1"
    end
    @model = build_model(args[:model])
    @max_wild_children = args[:max_wild_children] || 20
    @max_literal_children = args[:max_literal_children] || 100
    @max_length = args[:max_length] || Bing::ModelSpec.new(@model.model).size
    @verbose = args[:verbose] || false
    @regex = pattern_to_regex(pattern)
  end

  # Regex fragment for a wildcard of up to +n+ whitespace-separated tokens.
  def array_to_regex(n)
    n == 1 ? "\\S+" : "(?:\\S+ ){0,#{n-1}}\\S+"
  end

  # converts the pattern to a reg expression; literal items are escaped so
  # that characters such as "." only match themselves
  def pattern_to_regex(pattern)
    str = pattern.map do |el|
      el.is_a?(String) ? Regexp.escape(el) : array_to_regex(el[0])
    end.join(" ")
    Regexp.new(str)
  end

  # does the generated phrase match? Returns true or nil.
  def matches(str)
    (@regex =~ str) && true
  end

  # generate -- yields tuples of prefix, cp, and jp
  #
  # Walks the state tree depth-first using an explicit stack; every state
  # whose prefix matches the pattern regex is yielded together with its
  # stored cp and the model's jp for the prefix.
  def generate
    stack = []
    s = S.new("", @pattern, @max_wild_children, @max_literal_children, @max_length, 0.0, @model)
    $stderr.puts "Generate:Initializing.. #{s.inspect}." if @verbose
    stack.push(s)
    until stack.empty?
      current = stack.pop
      $stderr.puts("Current state: #{current}; stack size: #{stack.size}") if @verbose
      if matches(current.prefix)
        jp = @model.jp(current.prefix)
        yield [current.prefix, current.cp, jp]
      else
        $stderr.puts("Not a match: #{current.prefix.inspect}") if @verbose
      end
      current.children.each do |child|
        stack.push(child)
        $stderr.puts "Added child: #{child}" if @verbose
      end
    end
  end

  private

  # Turns the :model option into a model object. A String is wrapped in the
  # options hash that Bing::Ngram.new expects.
  def build_model(model)
    return Bing::Ngram.new unless model
    model.is_a?(String) ? Bing::Ngram.new(:model => model) : model
  end
end
|
115
|
+
|
116
|
+
|
117
|
+
|
118
|
+
|
119
|
+
# examples
|
120
|
+
# #x = R.new([ "veritable", [2], "of", [2] ], 20, 100, 5)
|
121
|
+
|
data/lib/microsoft_ngram.rb
CHANGED
@@ -1,8 +1,41 @@
|
|
1
|
-
require
|
1
|
+
require 'rubygems'
|
2
2
|
require "rest-client"
|
3
3
|
|
4
4
|
module Bing
|
5
|
-
|
5
|
+
|
6
|
+
# this class is only used to find the best default model,
|
7
|
+
# that is, for the default_model call
|
8
|
+
# Parses a model name such as "bing-body/apr10/5" into its model type
# ("body"), a sortable date string ("2010-04") and its n-gram size (5).
class ModelSpec
  # Three-letter month abbreviation => two-digit month number.
  MONTHS = {
    'jan' => '01', 'feb' => '02', 'mar' => '03', 'apr' => '04',
    'may' => '05', 'jun' => '06', 'jul' => '07', 'aug' => '08',
    'sep' => '09', 'oct' => '10', 'nov' => '11', 'dec' => '12'
  }.freeze

  attr_accessor :model_type, :date, :size

  # spec - String of the form "<source>-<type>/<mon><yy>/<size>".
  def initialize(spec)
    parts = spec.split('/')
    @model_type = parts[0].split('-')[1] # 'bing-body' -> 'body'
    yr = parts[1].split(/\D/)[-1].to_i + 2000 # trailing digits are a two-digit year
    month = parse_month(parts[1].split(/\d/)[0]) # leading letters are the month
    @date = "#{yr}-#{month}"
    @size = parts[2].to_i
  end

  # Returns the two-digit month number for a three-letter abbreviation,
  # or '??' when the abbreviation is not recognised.
  def parse_month(m)
    MONTHS.fetch(m, '??')
  end
end
|
38
|
+
|
6
39
|
class Ngram
|
7
40
|
|
8
41
|
@@endpoint = "http://web-ngram.research.microsoft.com/rest/lookup.svc/"
|
@@ -17,6 +50,26 @@ module Bing
|
|
17
50
|
@@models.include?(model)
|
18
51
|
end
|
19
52
|
|
53
|
+
# Returns the name of the model of the given type that sorts last by
# [date, size] -- i.e. the most recent and, within that date, the longest --
# or nil when no model of that type is known. The result is also stored in
# @@default_model.
def self.default_model(model_type='body')
  Bing::Ngram.models() if @@models == nil # cache the current models
  candidates = @@models.map { |name| [ModelSpec.new(name), name] }
  candidates = candidates.find_all { |spec, _name| spec.model_type == model_type }
  ranked = candidates.sort_by { |spec, _name| [spec.date, spec.size] }
  @@default_model = ranked.size > 0 ? ranked[-1][1] : nil
end
|
61
|
+
|
62
|
+
# Returns the names of all known models of the given type whose size is at
# least +min_size+, ordered from last to first by [date, size] (most recent,
# longest model first).
def self.models_find_all(model_type='body',min_size=1)
  Bing::Ngram.models() if @@models == nil # cache the current models
  specs = @@models.map { |name| [ModelSpec.new(name), name] }
  of_type = specs.find_all { |spec, _name| spec.model_type == model_type }
  big_enough = of_type.find_all { |spec, _name| spec.size >= min_size }
  ascending = big_enough.sort_by { |spec, _name| [spec.date, spec.size] }
  ascending.map { |_spec, name| name }.reverse
end
|
72
|
+
|
20
73
|
attr_accessor :user_token
|
21
74
|
# The model is the current model. Query this.models() for available models
|
22
75
|
attr_accessor :model
|
@@ -24,25 +77,28 @@ module Bing
|
|
24
77
|
attr_accessor :debug
|
25
78
|
# Ngram size based on model
|
26
79
|
attr_accessor :ngram_size
|
27
|
-
|
80
|
+
|
28
81
|
def initialize(args = {})
|
29
82
|
@user_token = args["user_token"] || args[:user_token] || ENV["NGRAM_TOKEN"]
|
30
83
|
unless @user_token
|
31
84
|
raise "Must provide user token as NGRAM_TOKEN env variable or as :user_token => token. To get a token, see http://web-ngram.research.microsoft.com/info/ "
|
32
85
|
end
|
33
86
|
# probably shouldn't change
|
34
|
-
@model = args["model"] || args[:model] || Bing::Ngram.
|
87
|
+
@model = args["model"] || args[:model] || Bing::Ngram.default_model()
|
35
88
|
unless Bing::Ngram.defined_model?(@model)
|
36
89
|
raise "Invalid model: #{@model}. Valid models are #{@@models.join('; ')}"
|
37
90
|
end
|
38
91
|
@debug = (args["debug"] || args[:debug] || nil)
|
92
|
+
#puts "Creating #{@model.inspect} with debug=#{@debug}"
|
39
93
|
@ngram_size = @model.split(/\//)[-1].to_i
|
40
94
|
end
|
41
95
|
|
96
|
+
|
97
|
+
|
42
98
|
def get(op,phrase,args)
|
43
99
|
model = args["model"] || args[:model] || @model
|
44
100
|
RestClient.get(@@endpoint + model + '/' + op, {:params => {:u => @user_token, :p => phrase}.merge(args)}) do |res,req,result|
|
45
|
-
$stderr.puts
|
101
|
+
$stderr.puts res.inspect if @debug
|
46
102
|
res
|
47
103
|
end
|
48
104
|
end
|
@@ -50,7 +106,7 @@ module Bing
|
|
50
106
|
def post(op,phrases,args)
|
51
107
|
model = args["model"] || args[:model] || @model
|
52
108
|
RestClient.post(@@endpoint + model + '/' + op + "?u=#{@user_token}", phrases.join("\n")) do |res,req,result|
|
53
|
-
$stderr.puts
|
109
|
+
$stderr.puts res.inspect if @debug
|
54
110
|
res
|
55
111
|
end
|
56
112
|
end
|
@@ -89,6 +145,13 @@ module Bing
|
|
89
145
|
end
|
90
146
|
end
|
91
147
|
end
|
148
|
+
|
149
|
+
# get a list of the next most popular tokens with log-freq
|
150
|
+
# Collects everything #generate yields for +phrase+ into an Array.
#
# phrase     - String context passed straight to #generate.
# max_length - limit passed straight to #generate.
#
# Returns an Array of the yielded items, in the order they were yielded.
def generate_list(phrase, max_length)
  candidates = []
  # Block parameter is named +pair+ rather than +p+ to avoid shadowing Kernel#p.
  generate(phrase, max_length) { |pair| candidates << pair }
  candidates
end
|
92
155
|
|
93
156
|
# spell-checking
|
94
157
|
# Bing::Ngram.new(:debug=>nil,:model=>'bing-body/jun09/1').jps(edits1("appresiate").uniq).sort{|a,b| b[1] <=> a[1]}[0..30]
|
data/microsoft_ngram.gemspec
CHANGED
metadata
CHANGED
@@ -1,70 +1,106 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: microsoft_ngram
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 23
|
5
5
|
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 4
|
10
|
+
version: 0.0.4
|
6
11
|
platform: ruby
|
7
|
-
authors:
|
12
|
+
authors:
|
8
13
|
- Will Fitzgerald
|
9
14
|
- Zeke Sikelianos
|
10
15
|
autorequire:
|
11
16
|
bindir: bin
|
12
17
|
cert_chain: []
|
13
|
-
|
14
|
-
|
15
|
-
|
18
|
+
|
19
|
+
date: 2012-02-09 00:00:00 Z
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
16
22
|
name: hoe
|
17
|
-
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
18
25
|
none: false
|
19
|
-
requirements:
|
20
|
-
- -
|
21
|
-
- !ruby/object:Gem::Version
|
22
|
-
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
23
33
|
type: :development
|
24
|
-
|
25
|
-
|
26
|
-
- !ruby/object:Gem::Dependency
|
34
|
+
version_requirements: *id001
|
35
|
+
- !ruby/object:Gem::Dependency
|
27
36
|
name: rspec
|
28
|
-
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
29
39
|
none: false
|
30
|
-
requirements:
|
40
|
+
requirements:
|
31
41
|
- - ~>
|
32
|
-
- !ruby/object:Gem::Version
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 47
|
44
|
+
segments:
|
45
|
+
- 2
|
46
|
+
- 8
|
47
|
+
- 0
|
33
48
|
version: 2.8.0
|
34
49
|
type: :development
|
35
|
-
|
36
|
-
|
37
|
-
- !ruby/object:Gem::Dependency
|
50
|
+
version_requirements: *id002
|
51
|
+
- !ruby/object:Gem::Dependency
|
38
52
|
name: autotest
|
39
|
-
|
53
|
+
prerelease: false
|
54
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
40
55
|
none: false
|
41
|
-
requirements:
|
42
|
-
- -
|
43
|
-
- !ruby/object:Gem::Version
|
44
|
-
|
56
|
+
requirements:
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
hash: 3
|
60
|
+
segments:
|
61
|
+
- 0
|
62
|
+
version: "0"
|
45
63
|
type: :development
|
46
|
-
|
47
|
-
|
48
|
-
- !ruby/object:Gem::Dependency
|
64
|
+
version_requirements: *id003
|
65
|
+
- !ruby/object:Gem::Dependency
|
49
66
|
name: rest-client
|
50
|
-
|
67
|
+
prerelease: false
|
68
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
51
69
|
none: false
|
52
|
-
requirements:
|
53
|
-
- -
|
54
|
-
- !ruby/object:Gem::Version
|
55
|
-
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
hash: 3
|
74
|
+
segments:
|
75
|
+
- 0
|
76
|
+
version: "0"
|
56
77
|
type: :runtime
|
78
|
+
version_requirements: *id004
|
79
|
+
- !ruby/object:Gem::Dependency
|
80
|
+
name: trollop
|
57
81
|
prerelease: false
|
58
|
-
|
82
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ">="
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
hash: 3
|
88
|
+
segments:
|
89
|
+
- 0
|
90
|
+
version: "0"
|
91
|
+
type: :runtime
|
92
|
+
version_requirements: *id005
|
59
93
|
description: A simple wrapper for Bing's ngram API
|
60
|
-
email:
|
94
|
+
email:
|
61
95
|
- will@wordnik.com
|
62
96
|
- zeke@sikelianos.com
|
63
|
-
executables:
|
97
|
+
executables:
|
64
98
|
- microsoft_ngram
|
65
99
|
extensions: []
|
100
|
+
|
66
101
|
extra_rdoc_files: []
|
67
|
-
|
102
|
+
|
103
|
+
files:
|
68
104
|
- .gitignore
|
69
105
|
- .rspec
|
70
106
|
- Gemfile
|
@@ -74,6 +110,10 @@ files:
|
|
74
110
|
- Rakefile
|
75
111
|
- bin/microsoft_ngram
|
76
112
|
- examples/segment.rb
|
113
|
+
- examples/synthese.rb
|
114
|
+
- examples/synthese_lib.rb
|
115
|
+
- examples/veritable.rb
|
116
|
+
- examples/veritable_lib.rb
|
77
117
|
- lib/microsoft_ngram.rb
|
78
118
|
- lib/microsoft_ngram/version.rb
|
79
119
|
- microsoft_ngram.gemspec
|
@@ -81,26 +121,36 @@ files:
|
|
81
121
|
- spec/spec_helper.rb
|
82
122
|
homepage: http://developer.wordnik.com
|
83
123
|
licenses: []
|
124
|
+
|
84
125
|
post_install_message:
|
85
126
|
rdoc_options: []
|
86
|
-
|
127
|
+
|
128
|
+
require_paths:
|
87
129
|
- lib
|
88
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
130
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
89
131
|
none: false
|
90
|
-
requirements:
|
91
|
-
- -
|
92
|
-
- !ruby/object:Gem::Version
|
93
|
-
|
94
|
-
|
132
|
+
requirements:
|
133
|
+
- - ">="
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
hash: 3
|
136
|
+
segments:
|
137
|
+
- 0
|
138
|
+
version: "0"
|
139
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
95
140
|
none: false
|
96
|
-
requirements:
|
97
|
-
- -
|
98
|
-
- !ruby/object:Gem::Version
|
99
|
-
|
141
|
+
requirements:
|
142
|
+
- - ">="
|
143
|
+
- !ruby/object:Gem::Version
|
144
|
+
hash: 3
|
145
|
+
segments:
|
146
|
+
- 0
|
147
|
+
version: "0"
|
100
148
|
requirements: []
|
149
|
+
|
101
150
|
rubyforge_project: microsoft_ngram
|
102
|
-
rubygems_version: 1.8.
|
151
|
+
rubygems_version: 1.8.13
|
103
152
|
signing_key:
|
104
153
|
specification_version: 3
|
105
154
|
summary: A simple wrapper for Bing's ngram API
|
106
155
|
test_files: []
|
156
|
+
|