zipf 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/zipf.rb +18 -0
- data/lib/zipf/SparseVector.rb +172 -0
- data/lib/zipf/Translation.rb +72 -0
- data/lib/zipf/bleu.rb +130 -0
- data/lib/zipf/dag.rb +205 -0
- data/lib/zipf/fileutil.rb +88 -0
- data/lib/zipf/grammar.rb +123 -0
- data/lib/zipf/hg.rb +173 -0
- data/lib/zipf/misc.rb +114 -0
- data/lib/zipf/semirings.rb +81 -0
- data/lib/zipf/stringutil.rb +22 -0
- data/lib/zipf/tfidf.rb +38 -0
- metadata +55 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: e8021bb7a07d98332028ec75ff1c3bf53149cab3
|
4
|
+
data.tar.gz: 958c53844c7e0b1b76b44bcc26fe60736449a1cd
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c444514cec3f6154c9011db7aac92b579e046dea3c88e24781db728f46cb67c1e789c50007edbc4dfe202c448a86b158f170ac3a6baf5cd4cae4ff5fd422b5c7
|
7
|
+
data.tar.gz: 43c54fa8adf44ef26d0894bb00a8de9e8ae30cc79efacbf40d20f7c8bc116a6cbecd2f641ccc23c582ddb4e8dc69627958d7b79c57f6fd1cd5f250fed3df6c50
|
data/lib/zipf.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Entry point of the zipf library: loads every sub-module and forces
# UTF-8 encoding on the three standard streams.

require 'zipf/stringutil'
require 'zipf/fileutil'
require 'zipf/SparseVector'
require 'zipf/tfidf'
require 'zipf/Translation'
require 'zipf/dag'
require 'zipf/semirings'
require 'zipf/bleu'
require 'zipf/misc'
require 'zipf/hg'
require 'zipf/grammar'

STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
STDERR.set_encoding 'utf-8'
|
18
|
+
|
@@ -0,0 +1,172 @@
|
|
1
|
+
# A sparse vector backed by a Hash from dimension key to numeric value;
# absent keys read as 0 (the Hash default).
class SparseVector < Hash

  # arg may be an Array, which is loaded as a dense vector with integer
  # keys 0..n-1.
  def initialize arg=nil
    super
    self.default = 0
    if arg.is_a? Array
      from_a arg
    end
  end

  # Load a dense array: index becomes the key, element the value.
  def from_a a
    a.each_with_index { |i,j| self[j] = i }
  end

  def self.from_a a
    v = SparseVector.new
    v.from_a a
    return v
  end

  # Copy all key/value pairs from a Hash.
  def from_h h
    h.each_pair { |k,v| self[k] = v }
  end

  def self.from_h h
    v = SparseVector.new
    v.from_h h
    return v
  end

  # Parse a Ruby Hash literal string.
  # NOTE(review): uses eval — never feed untrusted input.
  def from_s s
    from_h eval(s)
  end

  def self.from_s s
    v = SparseVector.new
    v.from_s s
    return v
  end

  # Render as "key=value key=value ..." (separators configurable).
  def to_kv sep='=', join=' '
    a = []
    self.each_pair { |k,v|
      a << "#{k}#{sep}#{v}"
    }
    return a.join join
  end

  # Parse "key=value key=value ..."; values become Floats.
  def from_kv s
    s.split.each { |i|
      k,v = i.split('=')
      self[k] = v.to_f
    }
  end

  def self.from_kv s
    v = SparseVector.new
    v.from_kv s
    return v
  end

  # Read one "key<sep>value" pair per line from file fn.
  def from_file fn, sep='='
    f = ReadFile.new(fn)
    while line = f.gets
      key, value = line.strip.split sep
      value = value.to_f
      self[key] = value
    end
  end

  def self.from_file fn, sep='='
    v = SparseVector.new
    v.from_file fn, sep
    return v
  end

  # Concatenation (not union) of both key lists; may contain duplicates.
  def join_keys other
    self.keys + other.keys
  end

  # Sum of all stored values (nil for an empty vector).
  def sum
    self.values.inject(:+)
  end

  # Approximate equality: same size, same keys, all values within p.
  def approx_eql? other, p=10**-10
    return false if !other
    return false if other.size!=self.size
    return false if other.keys.sort!=self.keys.sort
    self.keys.each { |k|
      return false if (self[k]-other[k]).abs>p
    }
    return true
  end

  def average
    self.sum/self.size.to_f
  end

  # NOTE(review): returns the SUM of squared deviations from the mean,
  # not divided by the element count — kept as-is since stddev builds
  # on it; confirm callers expect this definition.
  def variance
    avg = self.average
    var = 0.0
    self.values.each { |i| var += (avg - i)**2 }
    return var
  end

  def stddev
    Math.sqrt self.variance
  end

  # Dot product iterating over self's keys (missing keys in other read
  # as its default).
  def dot other
    sum = 0.0
    self.each_pair { |k,v| sum += v * other[k] }
    return sum
  end

  # Explicitly materialize dimensions 0..n-1 as 0.0.
  def zeros n
    (0).upto(n-1) { |i| self[i] = 0.0 }
  end

  # Euclidean (L2) norm.
  # BUG FIX: the original used inject without an initial value, so the
  # first stored value was used unsquared as the seed (e.g. [3,4] gave
  # sqrt(3+16) instead of sqrt(9+16)).
  def magnitude
    Math.sqrt self.values.inject(0.0) { |sum,i| sum+i**2 }
  end

  # Cosine similarity with another vector.
  def cosinus_sim other
    self.dot(other)/(self.magnitude*other.magnitude)
  end

  # Euclidean distance over the union of both key sets.
  def euclidian_dist other
    dims = [self.keys, other.keys].flatten.uniq
    sum = 0.0
    dims.each { |d| sum += (self[d] - other[d])**2 }
    return Math.sqrt(sum)
  end

  def + other
    new = SparseVector.new
    join_keys(other).each { |k|
      new[k] = self[k]+other[k]
    }
    return new
  end

  def - other
    new = SparseVector.new
    join_keys(other).each { |k|
      new[k] = self[k]-other[k]
    }
    return new
  end

  # Scalar multiplication only; raises for non-numeric arguments.
  def * scalar
    raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric
    new = SparseVector.new
    self.keys.each { |k|
      new[k] = self[k] * scalar
    }
    return new
  end

  # Element-wise mean of an array of SparseVectors.
  def self.mean a
    mean = SparseVector.new
    a.each { |i|
      i.each_pair { |k,v|
        mean[k] += v
      }
    }
    n = a.size.to_f
    mean.each_pair { |k,v| mean[k] = v/n }
    return mean
  end
end
|
172
|
+
|
@@ -0,0 +1,72 @@
|
|
1
|
+
# One entry of a decoder k-best list:
# "id ||| raw output ||| feature values ||| model score"
class Translation
  attr_accessor :id, :s, :raw, :f, :scores, :rank

  def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil
    @id = id
    @raw = raw
    @s = s
    @f = f
    @scores = scores
    @rank = rank
  end

  # Parse a k-best line into this object; strip_alignment removes
  # moses-style alignment markers like |0-1| from the surface string.
  def from_s t, strip_alignment=true, rank=nil
    id, raw, features, score = splitpipe(t, 3)
    raw.strip!
    @raw = raw
    if strip_alignment # the way moses does it
      @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ')
      @s.strip!
    else
      @s = raw
    end
    @id = id.to_i
    @f = SparseVector.from_kv features
    @scores[:decoder] = score.to_f
    @rank = rank
  end

  def self.from_s s
    t = self.new
    t.from_s s
    return t
  end

  # Render back into " ||| "-separated form.
  # BUG FIX: the original evaluated two modifier-`if` expressions in
  # sequence, so the last expression was nil whenever include_features
  # was true and the method returned nil.
  def to_s include_features=true
    if include_features
      [@id, @s, @f.to_kv('=', ' '), @scores[:decoder]].join(' ||| ')
    else
      [@id, @s, @scores[:decoder]].join(' ||| ')
    end
  end

  # Alternate rendering with rank and all scores.
  # NOTE(review): @score (singular) is never assigned anywhere in this
  # class and renders as empty — possibly meant @scores[:decoder];
  # left unchanged pending confirmation.
  def to_s2
    [@rank, @s, @score, @scores.to_s].join ' ||| '
  end
end
|
44
|
+
|
45
|
+
# Read a file of decoder k-best lists (lines "sent_id ||| ...", grouped
# consecutively by sentence id) and return an Array of Arrays of
# translation objects; each inner Array is one sentence's k-best list,
# with per-list ranks assigned to t.id.
def read_kbest_lists fn, translation_type=Translation
  kbest_lists = []
  cur = []
  f = ReadFile.new fn
  prev = -1
  c = 0
  id = 0
  while line = f.gets
    t = translation_type.new
    t.from_s line
    c = splitpipe(line)[0].to_i # sentence id of this line
    if c != prev # a new k-best list starts
      if cur.size > 0
        kbest_lists << cur
        cur = []
      end
      prev = c
      id = 0 # rank within the current list
    end
    t.id = id
    cur << t
    id += 1
  end
  # NOTE(review): for an empty input file this appends an empty list —
  # confirm callers tolerate that.
  kbest_lists << cur # last one
  f.close
  return kbest_lists
end
|
72
|
+
|
data/lib/zipf/bleu.rb
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
module BLEU

  # Accumulator of modified n-gram statistics for one or more segments.
  class BLEU::NgramCounts
    attr_accessor :sum, :clipped, :ref_len, :hyp_len, :n

    def initialize(n)
      @n = 0
      @sum = []     # total hypothesis n-gram counts, indexed by order-1
      @clipped = [] # counts clipped against the reference, per order
      @ref_len = 0.0
      @hyp_len = 0.0
      grow(n)
    end

    # Extend the per-order arrays up to order n.
    def grow(n)
      (n-@n).times {
        @sum << 0.0
        @clipped << 0.0
      }
      @n = n
    end

    # Element-wise accumulation of another NgramCounts.
    def plus_eq(other)
      if other.n > @n then grow(other.n) end
      0.upto(other.n-1) { |m|
        @sum[m] += other.sum[m]
        @clipped[m] += other.clipped[m]
      }
      @ref_len += other.ref_len
      @hyp_len += other.hyp_len
    end

    def to_s
      return "n=#{n} sum=#{sum} clipped=#{clipped} ref_len=#{ref_len} hyp_len=#{hyp_len}"
    end
  end

  # Multiset of n-grams, stored as space-joined strings.
  class BLEU::Ngrams
    def initialize
      @h_ = {}
      @h_.default = 0
    end

    def add(k)
      if k.class == Array then k = k.join ' ' end
      @h_[k] += 1
    end

    def get_count(k)
      if k.class == Array then k = k.join ' ' end
      return @h_[k]
    end

    # Yields [token_array, count] pairs.
    def each
      @h_.each_pair { |k,v|
        yield k.split, v
      }
    end

    def to_s
      @h_.to_s
    end
  end

  # Collect clipped n-gram statistics of hypothesis vs. reference,
  # optionally weighted by `times`.
  def BLEU::get_counts hypothesis, reference, n, times=1
    p = NgramCounts.new n
    r = Ngrams.new
    ngrams(reference, n) { |ng| r.add ng }
    h = Ngrams.new
    ngrams(hypothesis, n) { |ng| h.add ng }
    h.each { |ng,count|
      sz = ng.size-1
      p.sum[sz] += count * times
      p.clipped[sz] += [r.get_count(ng), count].min * times
    }
    p.ref_len = tokenize(reference.strip).size * times
    p.hyp_len = tokenize(hypothesis.strip).size * times
    return p
  end

  # Log-domain brevity penalty (<= 0.0); c = hypothesis length,
  # r = reference length.
  def BLEU::brevity_penalty c, r, smooth=0.0
    return [0.0, 1.0-((r+smooth)/c)].min
  end

  # Corpus BLEU over an array of NgramCounts.
  def BLEU::bleu counts, n, debug=false
    corpus_stats = NgramCounts.new n
    counts.each { |i| corpus_stats.plus_eq i }
    logbleu = 0.0
    0.upto(n-1) { |m|
      STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug
      # BUG FIX: the original compared the whole Array `sum` to 0
      # (always false); the per-order count sum[m] was intended.
      return 0.0 if corpus_stats.clipped[m] == 0 or corpus_stats.sum[m] == 0
      logbleu += Math.log(corpus_stats.clipped[m]) - Math.log(corpus_stats.sum[m])
    }
    logbleu /= n
    if debug
      STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)}\n"
      # BUG FIX: the original referenced an undefined local `sum`
      # (NameError when debug=true); report the averaged precision term.
      STDERR.write "sum #{Math.exp(logbleu)}\n"
    end
    logbleu += brevity_penalty corpus_stats.hyp_len, corpus_stats.ref_len
    return Math.exp logbleu
  end

  # BLEU scaled to 0..100, rounded to three digits.
  def BLEU::hbleu counts, n, debug=false
    (100*bleu(counts, n, debug)).round(3)
  end

  # Smoothed sentence-level BLEU (add-one smoothing for orders >= 2).
  def BLEU::per_sentence_bleu hypothesis, reference, n=4, smooth=0.0
    h_ng = {}; r_ng = {}
    (1).upto(n) { |i| h_ng[i] = []; r_ng[i] = [] }
    ngrams(hypothesis, n) { |i| h_ng[i.size] << i }
    ngrams(reference, n) { |i| r_ng[i.size] << i }
    m = [n, reference.split.size].min
    add = 0.0
    logbleu = 0.0
    (1).upto(m) { |i|
      counts_clipped = 0
      counts_sum = h_ng[i].size
      h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) }
      add = 1.0 if i >= 2
      logbleu += Math.log(counts_clipped+add) - Math.log(counts_sum+add);
    }
    logbleu /= m
    logbleu += brevity_penalty hypothesis.strip.split.size, reference.strip.split.size, smooth
    return Math.exp logbleu
  end

end #module
|
130
|
+
|
data/lib/zipf/dag.rb
ADDED
@@ -0,0 +1,205 @@
|
|
1
|
+
module DAG

require 'json'

# A node of a weighted directed acyclic graph.
class DAG::Node
  attr_accessor :label, :outgoing, :incoming, :score, :mark

  def initialize label=nil, outgoing=[], incoming=[], score=nil
    @label = label
    @outgoing = outgoing
    @incoming = incoming
    @score = nil
  end

  # Create an edge self -> head and return it.
  # NOTE(review): head.incoming is NOT updated here; callers such as
  # read_graph_from_json append the returned edge themselves.
  def add_edge head, weight=0
    exit if self==head # no self-cycles!
    @outgoing << DAG::Edge.new(self, head, weight)
    return @outgoing.last
  end

  def to_s
    "DAG::Node<label:#{label}, outgoing:#{outgoing.size}, incoming:#{incoming.size}>"
  end

  def repr
    "#{to_s} #{@score} out:#{@outgoing} in:[#{@incoming.map{|e| e.to_s}.join ', '}]"
  end
end

# A weighted directed edge.
class DAG::Edge
  attr_accessor :tail, :head, :weight, :mark

  def initialize tail=nil, head=nil, weight=0
    @tail = tail
    @head = head
    @weight = weight
    @mark = false # did we already follow this edge? -- for topological sorting
  end

  def to_s
    s = "DAG::Edge<#{@tail} ->[#{weight}] #{@head}"
    s += " x" if @mark
    s += ">"
    s
  end
end

# depth-first search
# w/o markings as we do not have cycles
def DAG::dfs n, target_label
  return n if n.label==target_label # assumes uniq labels!
  # BUG FIX: the original did `return DAG::dfs(m, ...)` unconditionally
  # on the first node popped from the stack, so sibling branches were
  # never explored and reachable targets could be reported as missing.
  stack = n.outgoing.map { |i| i.head }
  while !stack.empty?
    m = stack.pop
    return m if m.label==target_label
    m.outgoing.each { |e| stack << e.head }
  end
  return nil
end

# breadth-first search
# w/o markings as we do not have cycles
def DAG::bfs n, target_label
  queue = [n]
  while !queue.empty?
    m = queue.shift
    return m if m.label==target_label
    m.outgoing.each { |e| queue << e.head }
  end
  return nil
end

# topological sort (Kahn's algorithm; marks edges as followed)
def DAG::topological_sort graph
  sorted = []
  s = graph.reject { |n| !n.incoming.empty? }
  while !s.empty?
    sorted << s.shift
    sorted.last.outgoing.each { |e|
      e.mark = true
      s << e.head if e.head.incoming.reject{|f| f.mark}.empty?
    }
  end
  return sorted
end

# initialize graph scores: every node gets the semiring null, the
# source node gets the semiring one
def DAG::init graph, semiring, source_node
  graph.each {|n| n.score=semiring.null}
  source_node.score = semiring.one
end

# viterbi
# BUG FIX: the default used the ViterbiSemiring *class* instead of an
# instance; add/multiply are instance accessors, so the default was
# unusable.
def DAG::viterbi graph, semiring=ViterbiSemiring.new, source_node
  toposorted = DAG::topological_sort(graph)
  DAG::init(graph, semiring, source_node)
  toposorted.each { |n|
    n.incoming.each { |e|
      # update n's score with the contribution of this incoming edge
      n.score = \
        semiring.add.call(n.score, \
          semiring.multiply.call(e.tail.score, e.weight)
        )
    }
  }
end

# forward viterbi (pushes scores along outgoing edges)
def DAG::viterbi_forward graph, semiring=ViterbiSemiring.new, source_node
  toposorted = DAG::topological_sort(graph)
  DAG::init(graph, semiring, source_node)
  toposorted.each { |n|
    n.outgoing.each { |e|
      e.head.score = \
        semiring.add.call(e.head.score, \
          semiring.multiply.call(n.score, e.weight)
        )
    }
  }
end

# Dijkstra algorithm
# for A*-search we would need an optimistic estimate of
# future cost at each node
def DAG::dijkstra graph, semiring=RealSemiring.new, source_node
  DAG::init(graph, semiring, source_node)
  q = PriorityQueue.new graph
  while !q.empty?
    n = q.pop
    n.outgoing.each { |e|
      e.head.score = \
        semiring.add.call(e.head.score, \
          semiring.multiply.call(n.score, e.weight))
      q.sort! # re-establish priority order after the relaxation
    }
  end
end

# Bellman-Ford algorithm
def DAG::bellman_ford(graph, semiring=RealSemiring.new, source_node)
  DAG::init(graph, semiring, source_node)
  edges = []
  graph.each { |n| edges |= n.outgoing }
  # relax edges
  (graph.size-1).times{ |i|
    edges.each { |e|
      e.head.score = \
        semiring.add.call(e.head.score, \
          semiring.multiply.call(e.tail.score, e.weight))
    }
  }
  # we do not allow cycles (negative or positive)
end

# Floyd algorithm (all-pairs shortest distances; min-plus, the
# semiring parameter is unused)
def DAG::floyd(graph, semiring=nil)
  dist_matrix = []
  graph.each_index { |i|
    dist_matrix << []
    graph.each_index { |j|
      val = 1.0/0.0
      val = 0.0 if i==j
      dist_matrix.last << val
    }
  }
  edges = []
  graph.each { |n| edges |= n.outgoing }
  edges.each { |e|
    dist_matrix[graph.index(e.tail)][graph.index(e.head)] = e.weight
  }
  0.upto(graph.size-1) { |k|
    0.upto(graph.size-1) { |i|
      0.upto(graph.size-1) { |j|
        if dist_matrix[i][k] + dist_matrix[k][j] < dist_matrix[i][j]
          dist_matrix[i][j] = dist_matrix[i][k] + dist_matrix[k][j]
        end
      }
    }
  }
  return dist_matrix
end


# returns a list of nodes (graph) and a hash for finding
# nodes by their label (these need to be unique!)
def DAG::read_graph_from_json fn, semiring=RealSemiring.new
  graph = []
  nodes_by_label = {}
  h = JSON.parse File.new(fn).read
  h['nodes'].each { |i|
    n = DAG::Node.new i['label']
    graph << n
    nodes_by_label[n.label] = n
  }
  h['edges'].each { |i|
    n = nodes_by_label[i['tail']]
    a = n.add_edge(nodes_by_label[i['head']], semiring.convert.call(i['weight'].to_f))
    nodes_by_label[i['head']].incoming << a
  }
  return graph, nodes_by_label
end


end #module
|
205
|
+
|
@@ -0,0 +1,88 @@
|
|
1
|
+
require 'zlib'
|
2
|
+
|
3
|
+
|
4
|
+
# Line-oriented reader that transparently handles gzip files (by
# extension '.gz') and '-' for STDIN.
class ReadFile

  def initialize fn, encoding='utf-8'
    if fn.split('.').last == 'gz'
      @f = Zlib::GzipReader.new(File.new(fn, 'rb'), :external_encoding=>encoding)
    elsif fn == '-'
      @f = STDIN
      STDIN.set_encoding encoding
    else
      @f = File.new fn, 'r'
      @f.set_encoding encoding
    end
  end

  # Return the next line (nil at EOF).
  # BUG FIX: the original passed a dead block containing `yield` to
  # IO#gets; IO#gets ignores blocks, and the yield would have raised
  # LocalJumpError had it ever run. Plain delegation is the actual
  # behavior.
  def gets
    @f.gets
  end

  # All remaining lines, newlines included.
  def readlines
    @f.readlines
  end

  # Read all lines of fn, closing the file afterwards.
  def self.readlines fn, encoding='utf-8'
    f = ReadFile.new fn, encoding
    r = f.readlines
    f.close
    return r
  end

  # All remaining lines with surrounding whitespace stripped.
  def readlines_strip
    self.readlines.map{ |i| i.strip }
  end

  def self.readlines_strip fn, encoding='utf-8'
    f = ReadFile.new fn, encoding
    r = f.readlines_strip
    f.close
    return r
  end

  # Slurp the remainder of the stream.
  def read
    @f.read
  end

  def self.read fn, encoding='utf-8'
    f = ReadFile.new fn, encoding
    r = f.read
    f.close
    return r
  end

  # Close the underlying handle (never closes STDIN).
  def close
    @f.close if @f!=STDIN
  end
end
|
59
|
+
|
60
|
+
# Writer counterpart of ReadFile: gzip output for '.gz' paths and '-'
# for STDOUT.
class WriteFile

  def initialize fn, encoding='utf-8'
    if fn.split('.').last == 'gz'
      @f = Zlib::GzipWriter.new(File.new(fn, 'wb+'), :external_encoding=>encoding)
    elsif fn == '-'
      @f = STDOUT
      STDOUT.set_encoding encoding
    else
      @f = File.new fn, 'w+'
      @f.set_encoding encoding
    end
  end

  def write s
    @f.write s
  end

  # Write s to fn and close the file.
  def self.write s, fn, encoding='utf-8'
    f = WriteFile.new fn, encoding
    f.write s
    f.close
  end

  # Close the underlying handle.
  # BUG FIX: the original guarded against STDIN, not STDOUT, so using
  # '-' as the target closed the process's standard output on close.
  def close
    @f.close if @f!=STDOUT
  end
end
|
88
|
+
|
data/lib/zipf/grammar.rb
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
module Grammar


  # A terminal symbol.
  class T
    attr_accessor :word

    def initialize word
      @word = word
    end

    def to_s
      "T<#{@word}>"
    end
  end

  # A non-terminal symbol with an optional index and source span.
  class NT
    attr_accessor :symbol, :index, :span

    def initialize symbol, index=0
      @symbol = symbol
      @index = index
      @span = Span.new
    end

    def to_s
      "NT(#{@span.left},#{@span.right})<#{@symbol},#{@index}>"
    end
  end

  # A synchronous rule: lhs -> rhs (source side) ||| e (target side).
  class Rule
    attr_accessor :lhs, :rhs, :e

    def initialize lhs=nil, rhs=[], e=''
      @lhs = lhs
      @rhs = rhs
      @e = e
    end

    def to_s
      "#{lhs} -> #{rhs.map{ |i| i.to_s }.join ' '} [arity=#{arity}] ||| #{@e}"
    end

    # Number of non-terminals on the right-hand side.
    def arity
      rhs.select { |i| i.class == NT }.size
    end

    # Parse "[LHS] ||| src tokens ||| target" into this rule.
    # NOTE(review): gsub! returns nil when no bracket is replaced, so a
    # left-hand side without brackets would yield NT.new(nil) — confirm
    # inputs are always bracketed.
    def from_s s
      _ = splitpipe s, 3
      @lhs = NT.new _[0].strip.gsub!(/(\[|\])/, "")
      _[1].split.each { |x|
        x.strip!
        if x[0]=='[' && x[x.size-1] == ']'
          # bracketed token => non-terminal, e.g. "[X,1]"
          @rhs << NT.new(x.gsub!(/(\[|\])/, "").split(',')[0])
        else
          @rhs << T.new(x)
        end
      }
      @e = _[2]
    end

    def self.from_s s
      r = self.new
      r.from_s s
      return r
    end
  end

  # A [left, right) source-side span.
  class Span
    attr_accessor :left, :right

    def initialize left=nil, right=nil
      @left = left
      @right = right
    end
  end

  # A grammar read from file, with rules indexed into three bins:
  # startn (rhs begins with a non-terminal), startt (begins with a
  # terminal, arity > 0) and flat (no non-terminals at all).
  class Grammar
    attr_accessor :rules, :startn, :startt, :flat

    def initialize fn
      @rules = []; @startn = []; @startt = [] ;@flat = []
      ReadFile.readlines_strip(fn).each_with_index { |s,i|
        # progress indicator: one dot per rule, count every 80 rules
        STDERR.write '.'; STDERR.write " #{i+1}\n" if (i+1)%80==0
        @rules << Rule.from_s(s)
        if @rules.last.rhs.first.class == NT
          @startn << @rules.last
        else
          if rules.last.arity == 0
            @flat << @rules.last
          else
            @startt << @rules.last
          end
        end
      }
      STDERR.write "\n"
    end

    def to_s
      s = ''
      @rules.each { |r| s += r.to_s+"\n" }
      return s
    end

    # Add hiero-style glue rules: S -> X for every observed lhs symbol
    # (except S itself) plus the binary S -> S X rule.
    def add_glue_rules
      @rules.map { |r| r.lhs.symbol }.select { |s| s != 'S' }.uniq.each { |symbol|
        @rules << Rule.new(NT.new('S'), [NT.new(symbol)])
        @startn << @rules.last
        @rules << Rule.new(NT.new('S'), [NT.new('S'), NT.new('X')])
        @startn << @rules.last
      }
    end

    # Add X -> word pass-through rules for every token of sentence s.
    def add_pass_through_rules s
      s.each { |word|
        @rules << Rule.new(NT.new('X'), [T.new(word)])
        @flat << @rules.last
      }
    end
  end


end #module
|
123
|
+
|
data/lib/zipf/hg.rb
ADDED
@@ -0,0 +1,173 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require_relative 'semirings'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
|
7
|
+
module HG


  # A node of a directed hypergraph.
  class HG::Node
    attr_accessor :label, :cat, :outgoing, :incoming, :score

    def initialize label=nil, cat=nil, outgoing=[], incoming=[], score=nil
      @label = label
      @cat = cat
      @outgoing = outgoing
      @incoming = incoming
      # NOTE(review): the score argument is ignored; @score always
      # starts out nil.
      @score = nil
    end

    def to_s
      "Node<label:\"#{@label}\", cat:\"#{@cat}\", outgoing:#{@outgoing.size}, incoming:#{@incoming.size}>"
    end
  end

  # Container pairing a node list with an edge list.
  class HG::Hypergraph
    attr_accessor :nodes, :edges

    def initialize nodes=[], edges=[]
      @nodes = nodes
      @edges = edges
    end

    # Maximum tail size over all edges.
    def arity
      @edges.map { |e| e.arity }.max
    end

    def to_s
      "Hypergraph<nodes:[#{@nodes.to_s}], edges:[#{@edges.to_s}], arity:#{arity}>"
    end
  end

  # A hyperedge: one head node, any number of tail nodes, a weight and
  # a feature vector f.
  class HG::Hyperedge
    attr_accessor :head, :tails, :weight, :f, :mark, :rule, :left, :right

    def initialize head=nil, tails=[], weight=0.0, f={}
      @head = head
      @tails = tails
      @weight = weight
      @f = f
      @mark = 0 # number of tails already visited during topological sort
    end

    def arity
      return @tails.size
    end

    # True once every tail has been visited.
    def marked?
      arity == @mark
    end

    def to_s
      "Hyperedge<head:\"#{@head.label}\", \"tails:#{@tails.map{|n|n.label}}, arity:#{arity}, weight:#{@weight}, f:#{f.to_s}, mark:#{@mark}>"
    end
  end

  # Kahn-style topological ordering; an edge becomes traversable once
  # its mark count reaches its arity.
  def HG::topological_sort nodes
    sorted = []
    s = nodes.reject { |n| !n.incoming.empty? }
    while !s.empty?
      sorted << s.shift
      sorted.last.outgoing.each { |e|
        next if e.marked?
        e.mark += 1
        s << e.head if e.head.incoming.reject{ |f| f.mark==f.arity }.empty?
      }
    end
    return sorted
  end

  # Set every node's score to the semiring null and the root's to one.
  def HG::init nodes, semiring, root
    nodes.each { |n| n.score=semiring.null }
    root.score = semiring.one
  end

  # Inside pass over a topologically sorted hypergraph: each node's
  # score combines (semiring add) the products of its incoming edges'
  # tail scores and edge weights.
  def HG::viterbi hypergraph, root, semiring=ViterbiSemiring.new
    toposorted = topological_sort hypergraph.nodes
    init toposorted, semiring, root
    toposorted.each { |n|
      n.incoming.each { |e|
        s = semiring.one
        e.tails.each { |m|
          s = semiring.multiply.call(s, m.score)
        }
        n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight))
      }
    }
  end

  # Like viterbi, but also records, per node, the best incoming edge.
  # NOTE(review): nodes without incoming edges push nil into best_path —
  # confirm callers filter these.
  def HG::viterbi_path hypergraph, root, semiring=ViterbiSemiring.new
    toposorted = topological_sort hypergraph.nodes
    init toposorted, semiring, root
    best_path = []
    toposorted.each { |n|
      best_edge = nil
      n.incoming.each { |e|
        s = semiring.one
        e.tails.each { |m|
          s = semiring.multiply.call(s, m.score)
        }
        if n.score < semiring.multiply.call(s, e.weight) # ViterbiSemiring add
          best_edge = e
        end
        n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight))
      }
      best_path << best_edge
    }
    return best_path, toposorted.last.score
  end

  # Build a Hypergraph from a JSON file with 'weights', 'nodes' and
  # 'edges' keys; edge weights are recomputed as the dot product of the
  # global weight vector with the edge's feature vector (exponentiated
  # when log_weights is set).
  def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=false
    nodes = []
    edges = []
    nodes_by_label = {}
    nodes_by_index = []
    h = JSON.parse File.new(fn).read
    w = SparseVector.from_h h['weights']
    h['nodes'].each { |i|
      n = Node.new i['label'], i['cat']
      nodes << n
      nodes_by_label[n.label] = n
      nodes_by_index << n
    }
    h['edges'].each { |i|
      e = Hyperedge.new(nodes_by_label[i['head']], \
                        i['tails'].map{|j| nodes_by_label[j]}.to_a, \
                        semiring.convert.call(i['weight'].to_f), \
                        {})
      e.f = SparseVector.from_h i['f']
      if log_weights
        e.weight = Math.exp(w.dot(e.f))
      else
        e.weight = w.dot(e.f)
      end
      e.tails.each { |m|
        m.outgoing << e
      }
      e.head.incoming << e
      edges << e
    }
    return Hypergraph.new(nodes, edges), nodes_by_label, nodes_by_index
  end

  # Enumerate every full derivation (as a list of edges) bottom-up.
  # NOTE(review): the root and semiring parameters are unused here —
  # confirm whether pruning at root was intended.
  def HG::all_paths hypergraph, root, semiring=ViterbiSemiring.new
    toposorted = topological_sort hypergraph.nodes
    paths = [[]]
    toposorted.each { |n|
      next if n.incoming.empty?
      new_paths = []
      while !paths.empty?
        p = paths.pop
        n.incoming.each { |e|
          new_paths << p+[e]
        }
      end
      paths = new_paths
    }
    return paths
  end


end #module
|
173
|
+
|
data/lib/zipf/misc.rb
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
require 'timeout'
|
2
|
+
|
3
|
+
|
4
|
+
# Small numeric/set helpers mixed into core Array.
class Array
  # Index of the first maximum element.
  def max_index
    self.index(self.max)
  end

  # True iff every element of self is contained in other.
  # BUG FIX: the original condition was inverted — it returned false
  # exactly when an element WAS included in other.
  def is_subset_of? other
    self.each { |i|
      return false unless other.include? i
    }
    return true
  end

  # Sum of all elements (nil for an empty array).
  # NOTE(review): intentionally shadows the builtin Array#sum on modern
  # Rubies (which returns 0 for []); kept for compatibility.
  def sum
    self.inject(:+)
  end

  # Arithmetic mean as a Float.
  def mean
    self.sum.to_f/self.size
  end
end
|
26
|
+
|
27
|
+
class String

  # Predicate-style helper: returns the first lower-case letter found
  # in the string (truthy), or nil when there is none.
  def downcase?
    self.slice(/[[:lower:]]/)
  end
end
|
33
|
+
|
34
|
+
# List-backed priority queue. Elements must respond to a numeric
# `score`; the queue is kept ordered so that `pop` always returns the
# element with the LOWEST score.
class PriorityQueue

  def initialize a=Array.new
    @queue = Array.new a
    sort!
  end

  # Re-establish descending score order (lowest score ends up last,
  # where pop takes it).
  def sort!
    @queue = @queue.sort_by { |item| -item.score }
  end

  # Remove and return the lowest-scoring element (nil when empty).
  def pop
    @queue.pop
  end

  # Insert an element and restore the ordering.
  def push i
    @queue << i
    sort!
  end

  def empty?
    @queue.empty?
  end
end
|
60
|
+
|
61
|
+
# Run shell command cmd with a timeout of t seconds and return its
# stdout as a String. On timeout the child is sent TERM unless
# ignore_fail is set.
def spawn_with_timeout cmd, t=4, ignore_fail=false, debug=false
  STDERR.write cmd+"\n" if debug
  pipe_in, pipe_out = IO.pipe
  pid = Process.spawn(cmd, :out => pipe_out)
  begin
    Timeout.timeout(t) { Process.wait pid }
  rescue Timeout::Error
    # NOTE(review): with ignore_fail the timed-out child keeps running
    # and is never reaped — confirm this is intended.
    Process.kill('TERM', pid) if !ignore_fail
  end
  # close the write end so the read below sees EOF
  pipe_out.close
  return pipe_in.read
end
|
73
|
+
|
74
|
+
# Read a Moses-style phrase table ("src ||| tgt ||| features" lines)
# into a Hash mapping each source phrase to an Array of
# [target, feature SparseVector] pairs.
def read_phrase_table fn
  table = {}
  f = ReadFile.new fn
  while raw_rule = f.gets
    french, english, features = splitpipe(raw_rule)
    feature_map = SparseVector.from_kv features
    if table.has_key? french
      table[french] << [english, feature_map ]
    else
      table[french] = [[english, feature_map]]
    end
  end
  f.close
  return table
end
|
89
|
+
|
90
|
+
# Run the cdec decoder on a single input sentence and parse its k-best
# output into an Array of Translation objects (ranks assigned in output
# order). stderr of the decoder is discarded.
def cdec_kbest cdec_bin, input, ini, weights, k, unique=true
  require 'open3'
  cmd = "echo \"#{input}\" | #{cdec_bin} -c #{ini} -w #{weights} -k #{k}"
  cmd += " -r" if unique # -r: unique k-best lists
  o,_ = Open3.capture2 "#{cmd} 2>/dev/null"
  a = []; j = -1
  o.split("\n").map{ |i| j+=1; t=Translation.new; t.from_s(i, false, j); a << t }
  return a
end
|
99
|
+
|
100
|
+
# Parse a simple "key = value" configuration file into a Hash of
# Strings. '#' starts a comment; blank lines are skipped.
def read_config fn
  f = ReadFile.new fn
  cfg = {}
  while line = f.gets
    line.strip!
    next if /^\s*$/.match line
    next if line[0]=='#'
    # drop trailing comments
    content = line.split('#', 2).first
    k, v = content.split(/\s*=\s*/, 2)
    # NOTE(review): a line without '=' leaves v nil and crashes on
    # strip! — confirm inputs always contain '='. The file handle is
    # also never closed here.
    k.strip!; v.strip!
    cfg[k] = v
  end
  return cfg
end
|
114
|
+
|
@@ -0,0 +1,81 @@
|
|
1
|
+
# Semirings for directed acyclic graphs (dags) (also directed hypergraphs),
|
2
|
+
# as described in:
|
3
|
+
# 'Dynamic Programming Algorithms in
|
4
|
+
# Semiring and Hypergraph Frameworks' (Liang Huang)
|
5
|
+
#
|
6
|
+
|
7
|
+
# Base class for semirings: add/multiply are binary Procs, one/null the
# respective identity elements, and convert maps raw edge weights into
# the semiring's domain.
class Semiring
  attr_accessor :add, :multiply, :one, :null, :convert
end
|
10
|
+
|
11
|
+
class BooleanSemiring < Semiring
|
12
|
+
def initialize
|
13
|
+
@add = Proc.new { |a,b| a||b }
|
14
|
+
@multiply = Proc.new { |a,b| a&&b }
|
15
|
+
@one = true
|
16
|
+
@null = false
|
17
|
+
@convert = Proc.new { |v| true && v!=0 }
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
class ViterbiSemiring < Semiring
|
22
|
+
def initialize
|
23
|
+
@add = Proc.new { |a,b| [a,b].max }
|
24
|
+
@multiply = Proc.new { |a,b| a*b }
|
25
|
+
@one = 1.0
|
26
|
+
@null = 0.0
|
27
|
+
@convert = Proc.new { |v| v }
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
# Viterbi semiring in log space (max, +): best derivation over log-weights.
class ViterbiLogSemiring < Semiring
  def initialize
    @add      = proc { |x, y| [x, y].max }
    @multiply = proc { |x, y| x + y }
    @one      = 0.0
    @null     = -Float::INFINITY # same value as the original -1.0/0.0
    @convert  = proc { |w| w }
  end
end
|
40
|
+
|
41
|
+
# Inside semiring (+, *): total probability summed over all derivations.
class InsideSemiring < Semiring
  def initialize
    @add      = proc { |x, y| x + y }
    @multiply = proc { |x, y| x * y }
    @one      = 1.0
    @null     = 0.0
    @convert  = proc { |w| w }
  end
end
|
50
|
+
|
51
|
+
# Tropical/real semiring (min, +): shortest-path style costs.
class RealSemiring < Semiring
  def initialize
    @add      = proc { |x, y| [x, y].min }
    @multiply = proc { |x, y| x + y }
    @one      = 0.0
    @null     = Float::INFINITY # same value as the original 1.0/0.0
    @convert  = proc { |w| w }
  end
end
|
60
|
+
|
61
|
+
# for longest/worst paths
|
62
|
+
# Max-plus variant of the real semiring: longest/worst paths.
class RealxSemiring < Semiring
  def initialize
    @add      = proc { |x, y| [x, y].max }
    @multiply = proc { |x, y| x + y }
    @one      = -Float::INFINITY # same value as the original -1.0/0.0
    @null     = 0.0
    @convert  = proc { |w| w }
  end
end
|
71
|
+
|
72
|
+
# Counting semiring (+, *): number of distinct derivations/paths.
class CountingSemiring < Semiring
  def initialize
    @add      = proc { |x, y| x + y }
    @multiply = proc { |x, y| x * y }
    @one      = 1.0
    @null     = 0.0
    # Non-zero weights count as one path, zero weights as none.
    @convert  = proc { |w| w != 0 ? 1 : 0 }
  end
end
|
81
|
+
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Split a string into whitespace-delimited tokens.
# Returns an Array of Strings (empty for blank input).
def tokenize s
  s.strip.split(/\s+/)
end
|
4
|
+
|
5
|
+
# Yield every n-gram (as an Array of tokens) of string s, for all lengths
# 1..n starting at each position.
#
# s   - input string, tokenized on whitespace via tokenize()
# n   - maximum n-gram length
# fix - when true, yield only n-grams of exactly length n
def ngrams(s, n, fix=false)
  a = tokenize s
  a.each_with_index { |tok, i|
    tok.strip!
    0.upto([n-1, a.size-i-1].min) { |m|
      gram = a[i..i+m]
      # Original condition `!fix||(fix&&gram.size==n)` simplifies to this.
      yield gram if !fix || gram.size == n
    }
  }
end
|
14
|
+
|
15
|
+
# Unique, sorted tokens of s with stopwords removed.
# Returns an Array of Strings.
def bag_of_words s, stopwords=[]
  words = s.strip.split.uniq.sort
  words.reject { |w| stopwords.include?(w) }
end
|
18
|
+
|
19
|
+
# Split s on a delimiter of n consecutive pipe characters ("|||" by default),
# as used in phrase-table / k-best list formats.
def splitpipe s, n=3
  delim = '|' * n
  s.strip.split(delim)
end
|
22
|
+
|
data/lib/zipf/tfidf.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# Term-frequency / inverse-document-frequency helpers.
module TFIDF


  # Raw term frequencies for an array-like object.
  # Returns a Hash (default value 0) mapping each item to its count as a
  # Float; items in stopwords are skipped.
  def TFIDF::tf array, stopwords=[]
    counts = {}
    counts.default = 0
    array.uniq.each { |item|
      counts[item] = array.count(item).to_f unless stopwords.include?(item)
    }
    return counts
  end

  # Smooth the raw frequencies produced by tf() in place
  # (maximum-tf normalization; a is the smoothing term).
  def TFIDF::ntf hash, a=0.4
    max = hash.values.max.to_f
    hash.each_pair { |key, freq|
      hash[key] = a + (1 - a) * (freq / max)
    }
  end

  # Inverse document frequency for each item in the vocabulary.
  # NOTE(review): this flattens list_of_hashes.values, which only yields
  # individual words when the values are Arrays of words (Array#flatten does
  # not flatten Hash elements) -- verify what callers actually pass here.
  def TFIDF::idf list_of_hashes
    all = list_of_hashes.values.flatten
    n = list_of_hashes.size.to_f
    idf = {}
    all.uniq.each { |item|
      df = all.count item
      idf[item] = Math.log(n / df)
    }
    return idf
  end


end #module
|
38
|
+
|
metadata
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: zipf
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Patrick Simianer
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-06-16 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: NLP related tools and classes
|
14
|
+
email: p@simianer.de
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/zipf.rb
|
20
|
+
- lib/zipf/stringutil.rb
|
21
|
+
- lib/zipf/misc.rb
|
22
|
+
- lib/zipf/grammar.rb
|
23
|
+
- lib/zipf/hg.rb
|
24
|
+
- lib/zipf/fileutil.rb
|
25
|
+
- lib/zipf/semirings.rb
|
26
|
+
- lib/zipf/dag.rb
|
27
|
+
- lib/zipf/SparseVector.rb
|
28
|
+
- lib/zipf/tfidf.rb
|
29
|
+
- lib/zipf/bleu.rb
|
30
|
+
- lib/zipf/Translation.rb
|
31
|
+
homepage: http://simianer.de
|
32
|
+
licenses:
|
33
|
+
- MIT
|
34
|
+
metadata: {}
|
35
|
+
post_install_message:
|
36
|
+
rdoc_options: []
|
37
|
+
require_paths:
|
38
|
+
- lib
|
39
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - '>='
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '0'
|
49
|
+
requirements: []
|
50
|
+
rubyforge_project:
|
51
|
+
rubygems_version: 2.0.3
|
52
|
+
signing_key:
|
53
|
+
specification_version: 4
|
54
|
+
summary: zipf
|
55
|
+
test_files: []
|