zipf 1.0.1

checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e8021bb7a07d98332028ec75ff1c3bf53149cab3
4
+ data.tar.gz: 958c53844c7e0b1b76b44bcc26fe60736449a1cd
5
+ SHA512:
6
+ metadata.gz: c444514cec3f6154c9011db7aac92b579e046dea3c88e24781db728f46cb67c1e789c50007edbc4dfe202c448a86b158f170ac3a6baf5cd4cae4ff5fd422b5c7
7
+ data.tar.gz: 43c54fa8adf44ef26d0894bb00a8de9e8ae30cc79efacbf40d20f7c8bc116a6cbecd2f641ccc23c582ddb4e8dc69627958d7b79c57f6fd1cd5f250fed3df6c50
data/lib/zipf.rb ADDED
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'zipf/stringutil'
4
+ require 'zipf/fileutil'
5
+ require 'zipf/SparseVector'
6
+ require 'zipf/tfidf'
7
+ require 'zipf/Translation'
8
+ require 'zipf/dag'
9
+ require 'zipf/semirings'
10
+ require 'zipf/bleu'
11
+ require 'zipf/misc'
12
+ require 'zipf/hg'
13
+ require 'zipf/grammar'
14
+
15
+ STDIN.set_encoding 'utf-8'
16
+ STDOUT.set_encoding 'utf-8'
17
+ STDERR.set_encoding 'utf-8'
18
+
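Requiring the top-level file above pulls in every submodule and switches the standard streams to UTF-8, so a single require is all a script needs. A minimal sketch (assumes the gem is installed):

  require 'zipf'

  puts tokenize('ein kleines haus').inspect   # => ["ein", "kleines", "haus"]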
data/lib/zipf/SparseVector.rb ADDED
@@ -0,0 +1,172 @@
1
+ class SparseVector < Hash
2
+
3
+ def initialize arg=nil
4
+ super
5
+ self.default = 0
6
+ if arg.is_a? Array
7
+ from_a arg
8
+ end
9
+ end
10
+
11
+ def from_a a
12
+ a.each_with_index { |i,j| self[j] = i }
13
+ end
14
+
15
+ def self.from_a a
16
+ v = SparseVector.new
17
+ v.from_a a
18
+ return v
19
+ end
20
+
21
+ def from_h h
22
+ h.each_pair { |k,v| self[k] = v }
23
+ end
24
+
25
+ def self.from_h h
26
+ v = SparseVector.new
27
+ v.from_h h
28
+ return v
29
+ end
30
+
31
+ def from_s s
32
+ from_h eval(s)
33
+ end
34
+
35
+ def self.from_s s
36
+ v = SparseVector.new
37
+ v.from_s s
38
+ return v
39
+ end
40
+
41
+ def to_kv sep='=', join=' '
42
+ a = []
43
+ self.each_pair { |k,v|
44
+ a << "#{k}#{sep}#{v}"
45
+ }
46
+ return a.join join
47
+ end
48
+
49
+ def from_kv s
50
+ s.split.each { |i|
51
+ k,v = i.split('=')
52
+ self[k] = v.to_f
53
+ }
54
+ end
55
+
56
+ def self.from_kv s
57
+ v = SparseVector.new
58
+ v.from_kv s
59
+ return v
60
+ end
61
+
62
+ def from_file fn, sep='='
63
+ f = ReadFile.new(fn)
64
+ while line = f.gets
65
+ key, value = line.strip.split sep
66
+ value = value.to_f
67
+ self[key] = value
68
+ end
69
+ end
70
+
71
+ def self.from_file fn, sep='='
72
+ v = SparseVector.new
73
+ v.from_file fn, sep
74
+ return v
75
+ end
76
+
77
+ def join_keys other
78
+ self.keys + other.keys
79
+ end
80
+
81
+ def sum
82
+ self.values.inject(:+)
83
+ end
84
+
85
+ def approx_eql? other, p=10**-10
86
+ return false if !other
87
+ return false if other.size!=self.size
88
+ return false if other.keys.sort!=self.keys.sort
89
+ self.keys.each { |k|
90
+ return false if (self[k]-other[k]).abs>p
91
+ }
92
+ return true
93
+ end
94
+
95
+ def average
96
+ self.sum/self.size.to_f
97
+ end
98
+
99
+ def variance
100
+ avg = self.average
101
+ var = 0.0
102
+ self.values.each { |i| var += (avg - i)**2 }
103
+ return var
104
+ end
105
+
106
+ def stddev
107
+ Math.sqrt self.variance
108
+ end
109
+
110
+ def dot other
111
+ sum = 0.0
112
+ self.each_pair { |k,v| sum += v * other[k] }
113
+ return sum
114
+ end
115
+
116
+ def zeros n
117
+ (0).upto(n-1) { |i| self[i] = 0.0 }
118
+ end
119
+
120
+ def magnitude
121
+ Math.sqrt self.values.inject(0.0) { |sum,i| sum+i**2 }
122
+ end
123
+
124
+ def cosinus_sim other
125
+ self.dot(other)/(self.magnitude*other.magnitude)
126
+ end
127
+
128
+ def euclidian_dist other
129
+ dims = [self.keys, other.keys].flatten.uniq
130
+ sum = 0.0
131
+ dims.each { |d| sum += (self[d] - other[d])**2 }
132
+ return Math.sqrt(sum)
133
+ end
134
+
135
+ def + other
136
+ new = SparseVector.new
137
+ join_keys(other).each { |k|
138
+ new[k] = self[k]+other[k]
139
+ }
140
+ return new
141
+ end
142
+
143
+ def - other
144
+ new = SparseVector.new
145
+ join_keys(other).each { |k|
146
+ new[k] = self[k]-other[k]
147
+ }
148
+ return new
149
+ end
150
+
151
+ def * scalar
152
+ raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric
153
+ new = SparseVector.new
154
+ self.keys.each { |k|
155
+ new[k] = self[k] * scalar
156
+ }
157
+ return new
158
+ end
159
+
160
+ def self.mean a
161
+ mean = SparseVector.new
162
+ a.each { |i|
163
+ i.each_pair { |k,v|
164
+ mean[k] += v
165
+ }
166
+ }
167
+ n = a.size.to_f
168
+ mean.each_pair { |k,v| mean[k] = v/n }
169
+ return mean
170
+ end
171
+ end
172
+
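A minimal usage sketch of the SparseVector class above (illustrative only, not part of the gem):

  require 'zipf'

  x = SparseVector.from_kv 'a=1 b=2'              # {"a"=>1.0, "b"=>2.0}
  y = SparseVector.from_h({'a'=>3.0, 'c'=>4.0})
  puts x.dot(y)              # 1.0*3.0 + 2.0*0.0 = 3.0
  z = x + y
  puts z.to_kv               # "a=4.0 b=2.0 c=4.0"
  s = x * 2.0
  puts s.to_kv               # "a=2.0 b=4.0"
  puts x.euclidian_dist(y)   # sqrt(2**2 + 2**2 + 4**2) ~= 4.9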
data/lib/zipf/Translation.rb ADDED
@@ -0,0 +1,72 @@
1
+ class Translation
2
+ attr_accessor :id, :s, :raw, :f, :scores, :rank
3
+
4
+ def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil
5
+ @id = id
6
+ @raw = raw
7
+ @s = s
8
+ @f = f
9
+ @scores = scores
10
+ @rank = rank
11
+ end
12
+
13
+ def from_s t, strip_alignment=true, rank=nil
14
+ id, raw, features, score = splitpipe(t, 3)
15
+ raw.strip!
16
+ @raw = raw
17
+ if strip_alignment # the way moses does it
18
+ @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ')
19
+ @s.strip!
20
+ else
21
+ @s = raw
22
+ end
23
+ @id = id.to_i
24
+ @f = SparseVector.from_kv features
25
+ @scores[:decoder] = score.to_f
26
+ @rank = rank
27
+ end
28
+
29
+ def self.from_s s
30
+ t = self.new
31
+ t.from_s s
32
+ return t
33
+ end
34
+
35
+ def to_s include_features=true
36
+ return [@id, @s, @f.to_kv('=', ' '), @scores[:decoder]].join(' ||| ') if include_features
37
+ return [@id, @s, @scores[:decoder]].join(' ||| ')
38
+ end
39
+
40
+ def to_s2
41
+ [@rank, @s, @scores[:decoder], @scores.to_s].join ' ||| '
42
+ end
43
+ end
44
+
45
+ def read_kbest_lists fn, translation_type=Translation
46
+ kbest_lists = []
47
+ cur = []
48
+ f = ReadFile.new fn
49
+ prev = -1
50
+ c = 0
51
+ id = 0
52
+ while line = f.gets
53
+ t = translation_type.new
54
+ t.from_s line
55
+ c = splitpipe(line)[0].to_i
56
+ if c != prev
57
+ if cur.size > 0
58
+ kbest_lists << cur
59
+ cur = []
60
+ end
61
+ prev = c
62
+ id = 0
63
+ end
64
+ t.id = id
65
+ cur << t
66
+ id += 1
67
+ end
68
+ kbest_lists << cur # last one
69
+ f.close
70
+ return kbest_lists
71
+ end
72
+
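A minimal sketch of parsing one k-best entry with the Translation class above; the line is a hypothetical cdec/moses-style example (read_kbest_lists applies the same parsing to a whole file):

  require 'zipf'

  line = '0 ||| ein kleines haus ||| LogP=-3.2 Penalty=3 ||| -4.7'
  t = Translation.from_s line
  puts t.id                  # 0
  puts t.s                   # "ein kleines haus"
  puts t.f.to_kv             # "LogP=-3.2 Penalty=3.0"
  puts t.scores[:decoder]    # -4.7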
data/lib/zipf/bleu.rb ADDED
@@ -0,0 +1,130 @@
1
+ module BLEU
2
+
3
+
4
+ class BLEU::NgramCounts
5
+ attr_accessor :sum, :clipped, :ref_len, :hyp_len, :n
6
+
7
+ def initialize(n)
8
+ @n = 0
9
+ @sum = []
10
+ @clipped = []
11
+ @ref_len = 0.0
12
+ @hyp_len = 0.0
13
+ grow(n)
14
+ end
15
+
16
+ def grow(n)
17
+ (n-@n).times {
18
+ @sum << 0.0
19
+ @clipped << 0.0
20
+ }
21
+ @n = n
22
+ end
23
+
24
+ def plus_eq(other)
25
+ if other.n > @n then grow(other.n) end
26
+ 0.upto(other.n-1) { |m|
27
+ @sum[m] += other.sum[m]
28
+ @clipped[m] += other.clipped[m]
29
+ }
30
+ @ref_len += other.ref_len
31
+ @hyp_len += other.hyp_len
32
+ end
33
+
34
+ def to_s
35
+ return "n=#{n} sum=#{sum} clipped=#{clipped} ref_len=#{ref_len} hyp_len=#{hyp_len}"
36
+ end
37
+ end
38
+
39
+ class BLEU::Ngrams
40
+ def initialize
41
+ @h_ = {}
42
+ @h_.default = 0
43
+ end
44
+
45
+ def add(k)
46
+ if k.class == Array then k = k.join ' ' end
47
+ @h_[k] += 1
48
+ end
49
+
50
+ def get_count(k)
51
+ if k.class == Array then k = k.join ' ' end
52
+ return @h_[k]
53
+ end
54
+
55
+ def each
56
+ @h_.each_pair { |k,v|
57
+ yield k.split, v
58
+ }
59
+ end
60
+
61
+ def to_s
62
+ @h_.to_s
63
+ end
64
+ end
65
+
66
+ def BLEU::get_counts hypothesis, reference, n, times=1
67
+ p = NgramCounts.new n
68
+ r = Ngrams.new
69
+ ngrams(reference, n) { |ng| r.add ng }
70
+ h = Ngrams.new
71
+ ngrams(hypothesis, n) { |ng| h.add ng }
72
+ h.each { |ng,count|
73
+ sz = ng.size-1
74
+ p.sum[sz] += count * times
75
+ p.clipped[sz] += [r.get_count(ng), count].min * times
76
+ }
77
+ p.ref_len = tokenize(reference.strip).size * times
78
+ p.hyp_len = tokenize(hypothesis.strip).size * times
79
+ return p
80
+ end
81
+
82
+ def BLEU::brevity_penalty c, r, smooth=0.0
83
+ return [0.0, 1.0-((r+smooth)/c)].min
84
+ end
85
+
86
+ def BLEU::bleu counts, n, debug=false
87
+ corpus_stats = NgramCounts.new n
88
+ counts.each { |i| corpus_stats.plus_eq i }
89
+ logbleu = 0.0
90
+ 0.upto(n-1) { |m|
91
+ STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug
92
+ return 0.0 if corpus_stats.clipped[m] == 0 or corpus_stats.sum[m] == 0
93
+ logbleu += Math.log(corpus_stats.clipped[m]) - Math.log(corpus_stats.sum[m])
94
+ }
95
+ logbleu /= n
96
+ if debug
97
+ STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)}\n"
98
+ STDERR.write "sum #{Math.exp(sum)}\n"
99
+ end
100
+ logbleu += brevity_penalty corpus_stats.hyp_len, corpus_stats.ref_len
101
+ return Math.exp logbleu
102
+ end
103
+
104
+ def BLEU::hbleu counts, n, debug=false
105
+ (100*bleu(counts, n, debug)).round(3)
106
+ end
107
+
108
+ def BLEU::per_sentence_bleu hypothesis, reference, n=4, smooth=0.0
109
+ h_ng = {}; r_ng = {}
110
+ (1).upto(n) { |i| h_ng[i] = []; r_ng[i] = [] }
111
+ ngrams(hypothesis, n) { |i| h_ng[i.size] << i }
112
+ ngrams(reference, n) { |i| r_ng[i.size] << i }
113
+ m = [n, reference.split.size].min
114
+ add = 0.0
115
+ logbleu = 0.0
116
+ (1).upto(m) { |i|
117
+ counts_clipped = 0
118
+ counts_sum = h_ng[i].size
119
+ h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) }
120
+ add = 1.0 if i >= 2
121
+ logbleu += Math.log(counts_clipped+add) - Math.log(counts_sum+add);
122
+ }
123
+ logbleu /= m
124
+ logbleu += brevity_penalty hypothesis.strip.split.size, reference.strip.split.size, smooth
125
+ return Math.exp logbleu
126
+ end
127
+
128
+
129
+ end #module
130
+
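A minimal sketch of the BLEU module above on toy data, both corpus-level (accumulating per-sentence NgramCounts) and sentence-level:

  require 'zipf'

  hyps = ['a small house', 'the cat sat']
  refs = ['a small house', 'the cat sat on the mat']
  counts = hyps.zip(refs).map { |h,r| BLEU::get_counts h, r, 2 }
  puts BLEU::hbleu(counts, 2)   # corpus BLEU up to bigrams, scaled to 0..100

  puts BLEU::per_sentence_bleu('a small house', 'a little house')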
data/lib/zipf/dag.rb ADDED
@@ -0,0 +1,205 @@
1
+ module DAG
2
+
3
+ require 'json'
4
+
5
+
6
+ class DAG::Node
7
+ attr_accessor :label, :outgoing, :incoming, :score, :mark
8
+
9
+ def initialize label=nil, outgoing=[], incoming=[], score=nil
10
+ @label = label
11
+ @outgoing = outgoing
12
+ @incoming = incoming
13
+ @score = nil
14
+ end
15
+
16
+ def add_edge head, weight=0
17
+ raise 'no self-cycles allowed!' if self==head
18
+ @outgoing << DAG::Edge.new(self, head, weight)
19
+ return @outgoing.last
20
+ end
21
+
22
+ def to_s
23
+ "DAG::Node<label:#{label}, outgoing:#{outgoing.size}, incoming:#{incoming.size}>"
24
+ end
25
+
26
+ def repr
27
+ "#{to_s} #{@score} out:#{@outgoing} in:[#{@incoming.map{|e| e.to_s}.join ', '}]"
28
+ end
29
+ end
30
+
31
+ class DAG::Edge
32
+ attr_accessor :tail, :head, :weight, :mark
33
+
34
+ def initialize tail=nil, head=nil, weight=0
35
+ @tail = tail
36
+ @head = head
37
+ @weight = weight
38
+ @mark = false # did we already follow this edge? -- for topological sorting
39
+ end
40
+
41
+ def to_s
42
+ s = "DAG::Edge<#{@tail} ->[#{weight}] #{@head}"
43
+ s += " x" if @mark
44
+ s += ">"
45
+ s
46
+ end
47
+ end
48
+
49
+ # depth-first search
50
+ # w/o markings as we do not have cycles
51
+ def DAG::dfs n, target_label
52
+ return n if n.label==target_label # assumes uniq labels!
53
+ stack = n.outgoing.map { |i| i.head }
54
+ while !stack.empty?
55
+ m = stack.pop
56
+ return m if m.label==target_label
+ m.outgoing.each { |e| stack << e.head }
57
+ end
58
+ return nil
59
+ end
60
+
61
+ # breadth-first search
62
+ # w/o markings as we do not have cycles
63
+ def DAG::bfs n, target_label
64
+ queue = [n]
65
+ while !queue.empty?
66
+ m = queue.shift
67
+ return m if m.label==target_label
68
+ m.outgoing.each { |e| queue << e.head }
69
+ end
70
+ return nil
71
+ end
72
+
73
+ # topological sort
74
+ def DAG::topological_sort graph
75
+ sorted = []
76
+ s = graph.reject { |n| !n.incoming.empty? }
77
+ while !s.empty?
78
+ sorted << s.shift
79
+ sorted.last.outgoing.each { |e|
80
+ e.mark = true
81
+ s << e.head if e.head.incoming.reject{|f| f.mark}.empty?
82
+ }
83
+ end
84
+ return sorted
85
+ end
86
+
87
+ # initialize scores: semiring null everywhere, semiring one at the source node
88
+ def DAG::init graph, semiring, source_node
89
+ graph.each {|n| n.score=semiring.null}
90
+ source_node.score = semiring.one
91
+ end
92
+
93
+ # viterbi
94
+ def DAG::viterbi graph, semiring=ViterbiSemiring.new, source_node
95
+ toposorted = DAG::topological_sort(graph)
96
+ DAG::init(graph, semiring, source_node)
97
+ toposorted.each { |n|
98
+ n.incoming.each { |e|
99
+ # update
100
+ n.score = \
101
+ semiring.add.call(n.score, \
102
+ semiring.multiply.call(e.tail.score, e.weight)
103
+ )
104
+ }
105
+ }
106
+ end
107
+
108
+ # forward viterbi
109
+ def DAG::viterbi_forward graph, semiring=ViterbiSemiring.new, source_node
110
+ toposorted = DAG::topological_sort(graph)
111
+ DAG::init(graph, semiring, source_node)
112
+ toposorted.each { |n|
113
+ n.outgoing.each { |e|
114
+ e.head.score = \
115
+ semiring.add.call(e.head.score, \
116
+ semiring.multiply.call(n.score, e.weight)
117
+ )
118
+ }
119
+ }
120
+ end
121
+
122
+ # Dijkstra algorithm
123
+ # for A*-search we would need an optimistic estimate of
124
+ # future cost at each node
125
+ def DAG::dijkstra graph, semiring=RealSemiring.new, source_node
126
+ DAG::init(graph, semiring, source_node)
127
+ q = PriorityQueue.new graph
128
+ while !q.empty?
129
+ n = q.pop
130
+ n.outgoing.each { |e|
131
+ e.head.score = \
132
+ semiring.add.call(e.head.score, \
133
+ semiring.multiply.call(n.score, e.weight))
134
+ q.sort!
135
+ }
136
+ end
137
+ end
138
+
139
+ # Bellman-Ford algorithm
140
+ def DAG::bellman_ford(graph, semiring=RealSemiring.new, source_node)
141
+ DAG::init(graph, semiring, source_node)
142
+ edges = []
143
+ graph.each { |n| edges |= n.outgoing }
144
+ # relax edges
145
+ (graph.size-1).times{ |i|
146
+ edges.each { |e|
147
+ e.head.score = \
148
+ semiring.add.call(e.head.score, \
149
+ semiring.multiply.call(e.tail.score, e.weight))
150
+ }
151
+ }
152
+ # we do not allow cycles (negative or positive)
153
+ end
154
+
155
+ # Floyd algorithm
156
+ def DAG::floyd(graph, semiring=nil)
157
+ dist_matrix = []
158
+ graph.each_index { |i|
159
+ dist_matrix << []
160
+ graph.each_index { |j|
161
+ val = 1.0/0.0
162
+ val = 0.0 if i==j
163
+ dist_matrix.last << val
164
+ }
165
+ }
166
+ edges = []
167
+ graph.each { |n| edges |= n.outgoing }
168
+ edges.each { |e|
169
+ dist_matrix[graph.index(e.tail)][graph.index(e.head)] = e.weight
170
+ }
171
+ 0.upto(graph.size-1) { |k|
172
+ 0.upto(graph.size-1) { |i|
173
+ 0.upto(graph.size-1) { |j|
174
+ if dist_matrix[i][k] + dist_matrix[k][j] < dist_matrix[i][j]
175
+ dist_matrix[i][j] = dist_matrix[i][k] + dist_matrix[k][j]
176
+ end
177
+ }
178
+ }
179
+ }
180
+ return dist_matrix
181
+ end
182
+
183
+
184
+ # returns a list of nodes (graph) and a hash for finding
185
+ # nodes by their label (these need to be unique!)
186
+ def DAG::read_graph_from_json fn, semiring=RealSemiring.new
187
+ graph = []
188
+ nodes_by_label = {}
189
+ h = JSON.parse File.new(fn).read
190
+ h['nodes'].each { |i|
191
+ n = DAG::Node.new i['label']
192
+ graph << n
193
+ nodes_by_label[n.label] = n
194
+ }
195
+ h['edges'].each { |i|
196
+ n = nodes_by_label[i['tail']]
197
+ a = n.add_edge(nodes_by_label[i['head']], semiring.convert.call(i['weight'].to_f))
198
+ nodes_by_label[i['head']].incoming << a
199
+ }
200
+ return graph, nodes_by_label
201
+ end
202
+
203
+
204
+ end #module
205
+
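A minimal sketch of the DAG module above: build a three-node graph by hand (registering incoming edges the same way read_graph_from_json does) and compute shortest distances from the source with the min-plus RealSemiring:

  require 'zipf'

  a = DAG::Node.new 'a'
  b = DAG::Node.new 'b'
  c = DAG::Node.new 'c'
  b.incoming << a.add_edge(b, 1.0)
  c.incoming << b.add_edge(c, 2.0)
  c.incoming << a.add_edge(c, 5.0)

  DAG::viterbi_forward [a, b, c], RealSemiring.new, a
  puts c.score   # 3.0 -- the path a->b->c beats the direct a->c edge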
data/lib/zipf/fileutil.rb ADDED
@@ -0,0 +1,88 @@
1
+ require 'zlib'
2
+
3
+
4
+ class ReadFile
5
+
6
+ def initialize fn, encoding='utf-8'
7
+ if fn.split('.').last == 'gz'
8
+ @f = Zlib::GzipReader.new(File.new(fn, 'rb'), :external_encoding=>encoding)
9
+ elsif fn == '-'
10
+ @f = STDIN
11
+ STDIN.set_encoding encoding
12
+ else
13
+ @f = File.new fn, 'r'
14
+ @f.set_encoding encoding
15
+ end
16
+ end
17
+
18
+ def gets
19
+ @f.gets
20
+ end
21
+
22
+ def readlines
23
+ @f.readlines
24
+ end
25
+
26
+ def self.readlines fn, encoding='utf-8'
27
+ f = ReadFile.new fn, encoding
28
+ r = f.readlines
29
+ f.close
30
+ return r
31
+ end
32
+
33
+ def readlines_strip
34
+ self.readlines.map{ |i| i.strip }
35
+ end
36
+
37
+ def self.readlines_strip fn, encoding='utf-8'
38
+ f = ReadFile.new fn, encoding
39
+ r = f.readlines_strip
40
+ f.close
41
+ return r
42
+ end
43
+
44
+ def read
45
+ @f.read
46
+ end
47
+
48
+ def self.read fn, encoding='utf-8'
49
+ f = ReadFile.new fn, encoding
50
+ r = f.read
51
+ f.close
52
+ return r
53
+ end
54
+
55
+ def close
56
+ @f.close if @f!=STDIN
57
+ end
58
+ end
59
+
60
+ class WriteFile
61
+
62
+ def initialize fn, encoding='utf-8'
63
+ if fn.split('.').last == 'gz'
64
+ @f = Zlib::GzipWriter.new(File.new(fn, 'wb+'), :external_encoding=>encoding)
65
+ elsif fn == '-'
66
+ @f = STDOUT
67
+ STDOUT.set_encoding encoding
68
+ else
69
+ @f = File.new fn, 'w+'
70
+ @f.set_encoding encoding
71
+ end
72
+ end
73
+
74
+ def write s
75
+ @f.write s
76
+ end
77
+
78
+ def self.write s, fn, encoding='utf-8'
79
+ f = WriteFile.new fn, encoding
80
+ f.write s
81
+ f.close
82
+ end
83
+
84
+ def close
85
+ @f.close if @f!=STDOUT
86
+ end
87
+ end
88
+
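A minimal sketch of the file helpers above; gzip handling is chosen from the file extension and '-' maps to STDIN/STDOUT (the path below is hypothetical):

  require 'zipf'

  WriteFile.write "hello\nworld\n", 'example.txt.gz'
  f = ReadFile.new 'example.txt.gz'
  while line = f.gets
    puts line.strip
  end
  f.close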
data/lib/zipf/grammar.rb ADDED
@@ -0,0 +1,123 @@
1
+ module Grammar
2
+
3
+
4
+ class T
5
+ attr_accessor :word
6
+
7
+ def initialize word
8
+ @word = word
9
+ end
10
+
11
+ def to_s
12
+ "T<#{@word}>"
13
+ end
14
+ end
15
+
16
+ class NT
17
+ attr_accessor :symbol, :index, :span
18
+
19
+ def initialize symbol, index=0
20
+ @symbol = symbol
21
+ @index = index
22
+ @span = Span.new
23
+ end
24
+
25
+ def to_s
26
+ "NT(#{@span.left},#{@span.right})<#{@symbol},#{@index}>"
27
+ end
28
+ end
29
+
30
+ class Rule
31
+ attr_accessor :lhs, :rhs, :e
32
+
33
+ def initialize lhs=nil, rhs=[], e=''
34
+ @lhs = lhs
35
+ @rhs = rhs
36
+ @e = e
37
+ end
38
+
39
+ def to_s
40
+ "#{lhs} -> #{rhs.map{ |i| i.to_s }.join ' '} [arity=#{arity}] ||| #{@e}"
41
+ end
42
+
43
+ def arity
44
+ rhs.select { |i| i.class == NT }.size
45
+ end
46
+
47
+ def from_s s
48
+ _ = splitpipe s, 3
49
+ @lhs = NT.new _[0].strip.gsub!(/(\[|\])/, "")
50
+ _[1].split.each { |x|
51
+ x.strip!
52
+ if x[0]=='[' && x[x.size-1] == ']'
53
+ @rhs << NT.new(x.gsub!(/(\[|\])/, "").split(',')[0])
54
+ else
55
+ @rhs << T.new(x)
56
+ end
57
+ }
58
+ @e = _[2]
59
+ end
60
+
61
+ def self.from_s s
62
+ r = self.new
63
+ r.from_s s
64
+ return r
65
+ end
66
+ end
67
+
68
+ class Span
69
+ attr_accessor :left, :right
70
+
71
+ def initialize left=nil, right=nil
72
+ @left = left
73
+ @right = right
74
+ end
75
+ end
76
+
77
+ class Grammar
78
+ attr_accessor :rules, :startn, :startt, :flat
79
+
80
+ def initialize fn
81
+ @rules = []; @startn = []; @startt = [] ;@flat = []
82
+ ReadFile.readlines_strip(fn).each_with_index { |s,i|
83
+ STDERR.write '.'; STDERR.write " #{i+1}\n" if (i+1)%80==0
84
+ @rules << Rule.from_s(s)
85
+ if @rules.last.rhs.first.class == NT
86
+ @startn << @rules.last
87
+ else
88
+ if rules.last.arity == 0
89
+ @flat << @rules.last
90
+ else
91
+ @startt << @rules.last
92
+ end
93
+ end
94
+ }
95
+ STDERR.write "\n"
96
+ end
97
+
98
+ def to_s
99
+ s = ''
100
+ @rules.each { |r| s += r.to_s+"\n" }
101
+ return s
102
+ end
103
+
104
+ def add_glue_rules
105
+ @rules.map { |r| r.lhs.symbol }.select { |s| s != 'S' }.uniq.each { |symbol|
106
+ @rules << Rule.new(NT.new('S'), [NT.new(symbol)])
107
+ @startn << @rules.last
108
+ @rules << Rule.new(NT.new('S'), [NT.new('S'), NT.new('X')])
109
+ @startn << @rules.last
110
+ }
111
+ end
112
+
113
+ def add_pass_through_rules s
114
+ s.each { |word|
115
+ @rules << Rule.new(NT.new('X'), [T.new(word)])
116
+ @flat << @rules.last
117
+ }
118
+ end
119
+ end
120
+
121
+
122
+ end #module
123
+
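A minimal sketch of parsing a single synchronous rule with the Rule class above (the rule string is a hypothetical Hiero-style example):

  require 'zipf'

  r = Grammar::Rule.from_s '[X] ||| ein [X,1] haus ||| a [X,1] house'
  puts r.arity   # 1
  puts r.to_s    # lhs -> source side [arity=1] ||| target side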
data/lib/zipf/hg.rb ADDED
@@ -0,0 +1,173 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative 'semirings'
4
+ require 'json'
5
+
6
+
7
+ module HG
8
+
9
+
10
+ class HG::Node
11
+ attr_accessor :label, :cat, :outgoing, :incoming, :score
12
+
13
+ def initialize label=nil, cat=nil, outgoing=[], incoming=[], score=nil
14
+ @label = label
15
+ @cat = cat
16
+ @outgoing = outgoing
17
+ @incoming = incoming
18
+ @score = nil
19
+ end
20
+
21
+ def to_s
22
+ "Node<label:\"#{@label}\", cat:\"#{@cat}\", outgoing:#{@outgoing.size}, incoming:#{@incoming.size}>"
23
+ end
24
+ end
25
+
26
+ class HG::Hypergraph
27
+ attr_accessor :nodes, :edges
28
+
29
+ def initialize nodes=[], edges=[]
30
+ @nodes = nodes
31
+ @edges = edges
32
+ end
33
+
34
+ def arity
35
+ @edges.map { |e| e.arity }.max
36
+ end
37
+
38
+ def to_s
39
+ "Hypergraph<nodes:[#{@nodes.to_s}], edges:[#{@edges.to_s}], arity:#{arity}>"
40
+ end
41
+ end
42
+
43
+ class HG::Hyperedge
44
+ attr_accessor :head, :tails, :weight, :f, :mark, :rule, :left, :right
45
+
46
+ def initialize head=nil, tails=[], weight=0.0, f={}
47
+ @head = head
48
+ @tails = tails
49
+ @weight = weight
50
+ @f = f
51
+ @mark = 0
52
+ end
53
+
54
+ def arity
55
+ return @tails.size
56
+ end
57
+
58
+ def marked?
59
+ arity == @mark
60
+ end
61
+
62
+ def to_s
63
+ "Hyperedge<head:\"#{@head.label}\", \"tails:#{@tails.map{|n|n.label}}, arity:#{arity}, weight:#{@weight}, f:#{f.to_s}, mark:#{@mark}>"
64
+ end
65
+ end
66
+
67
+ def HG::topological_sort nodes
68
+ sorted = []
69
+ s = nodes.reject { |n| !n.incoming.empty? }
70
+ while !s.empty?
71
+ sorted << s.shift
72
+ sorted.last.outgoing.each { |e|
73
+ next if e.marked?
74
+ e.mark += 1
75
+ s << e.head if e.head.incoming.reject{ |f| f.mark==f.arity }.empty?
76
+ }
77
+ end
78
+ return sorted
79
+ end
80
+
81
+ def HG::init nodes, semiring, root
82
+ nodes.each { |n| n.score=semiring.null }
83
+ root.score = semiring.one
84
+ end
85
+
86
+ def HG::viterbi hypergraph, root, semiring=ViterbiSemiring.new
87
+ toposorted = topological_sort hypergraph.nodes
88
+ init toposorted, semiring, root
89
+ toposorted.each { |n|
90
+ n.incoming.each { |e|
91
+ s = semiring.one
92
+ e.tails.each { |m|
93
+ s = semiring.multiply.call(s, m.score)
94
+ }
95
+ n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight))
96
+ }
97
+ }
98
+ end
99
+
100
+ def HG::viterbi_path hypergraph, root, semiring=ViterbiSemiring.new
101
+ toposorted = topological_sort hypergraph.nodes
102
+ init toposorted, semiring, root
103
+ best_path = []
104
+ toposorted.each { |n|
105
+ best_edge = nil
106
+ n.incoming.each { |e|
107
+ s = semiring.one
108
+ e.tails.each { |m|
109
+ s = semiring.multiply.call(s, m.score)
110
+ }
111
+ if n.score < semiring.multiply.call(s, e.weight) # ViterbiSemiring add
112
+ best_edge = e
113
+ end
114
+ n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight))
115
+ }
116
+ best_path << best_edge
117
+ }
118
+ return best_path, toposorted.last.score
119
+ end
120
+
121
+ def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=false
122
+ nodes = []
123
+ edges = []
124
+ nodes_by_label = {}
125
+ nodes_by_index = []
126
+ h = JSON.parse File.new(fn).read
127
+ w = SparseVector.from_h h['weights']
128
+ h['nodes'].each { |i|
129
+ n = Node.new i['label'], i['cat']
130
+ nodes << n
131
+ nodes_by_label[n.label] = n
132
+ nodes_by_index << n
133
+ }
134
+ h['edges'].each { |i|
135
+ e = Hyperedge.new(nodes_by_label[i['head']], \
136
+ i['tails'].map{|j| nodes_by_label[j]}.to_a, \
137
+ semiring.convert.call(i['weight'].to_f), \
138
+ {})
139
+ e.f = SparseVector.from_h i['f']
140
+ if log_weights
141
+ e.weight = Math.exp(w.dot(e.f))
142
+ else
143
+ e.weight = w.dot(e.f)
144
+ end
145
+ e.tails.each { |m|
146
+ m.outgoing << e
147
+ }
148
+ e.head.incoming << e
149
+ edges << e
150
+ }
151
+ return Hypergraph.new(nodes, edges), nodes_by_label, nodes_by_index
152
+ end
153
+
154
+ def HG::all_paths hypergraph, root, semiring=ViterbiSemiring.new
155
+ toposorted = topological_sort hypergraph.nodes
156
+ paths = [[]]
157
+ toposorted.each { |n|
158
+ next if n.incoming.empty?
159
+ new_paths = []
160
+ while !paths.empty?
161
+ p = paths.pop
162
+ n.incoming.each { |e|
163
+ new_paths << p+[e]
164
+ }
165
+ end
166
+ paths = new_paths
167
+ }
168
+ return paths
169
+ end
170
+
171
+
172
+ end #module
173
+
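A minimal sketch of the hypergraph reader and Viterbi above, using a tiny hypothetical JSON file in the format read_hypergraph_from_json expects ('weights', 'nodes', 'edges'):

  require 'zipf'

  json = <<-EOS
  {"weights": {"lm": 1.0},
   "nodes": [ {"label": "root", "cat": "S"}, {"label": "goal", "cat": "S"} ],
   "edges": [ {"head": "goal", "tails": ["root"], "weight": 1.0, "f": {"lm": 0.5}} ]}
  EOS
  WriteFile.write json, 'hg.json'

  hg, by_label, _ = HG::read_hypergraph_from_json 'hg.json', ViterbiSemiring.new
  path, score = HG::viterbi_path hg, by_label['root']
  puts score   # 0.5 -- each edge weight is the dot product of 'weights' and 'f'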
data/lib/zipf/misc.rb ADDED
@@ -0,0 +1,114 @@
1
+ require 'timeout'
2
+
3
+
4
+ class Array
5
+ def max_index
6
+ self.index(self.max)
7
+ end
8
+
9
+ def is_subset_of? other
10
+ self.each { |i|
11
+ if !other.include? i
12
+ return false
13
+ end
14
+ }
15
+ return true
16
+ end
17
+
18
+ def sum
19
+ self.inject(:+)
20
+ end
21
+
22
+ def mean
23
+ self.sum.to_f/self.size
24
+ end
25
+ end
26
+
27
+ class String
28
+
29
+ def downcase?
30
+ self[/[[:lower:]]/]
31
+ end
32
+ end
33
+
34
+ class PriorityQueue
35
+ # This assumes that elements in the queue
36
+ # have a numerical member named 'score'.
37
+
38
+ def initialize a=Array.new
39
+ @queue = Array.new a
40
+ sort!
41
+ end
42
+
43
+ def sort!
44
+ @queue.sort_by! { |i| -i.score }
45
+ end
46
+
47
+ def pop
48
+ @queue.pop
49
+ end
50
+
51
+ def push i
52
+ @queue << i
53
+ sort!
54
+ end
55
+
56
+ def empty?
57
+ @queue.empty?
58
+ end
59
+ end
60
+
61
+ def spawn_with_timeout cmd, t=4, ignore_fail=false, debug=false
62
+ STDERR.write cmd+"\n" if debug
63
+ pipe_in, pipe_out = IO.pipe
64
+ pid = Process.spawn(cmd, :out => pipe_out)
65
+ begin
66
+ Timeout.timeout(t) { Process.wait pid }
67
+ rescue Timeout::Error
68
+ Process.kill('TERM', pid) if !ignore_fail
69
+ end
70
+ pipe_out.close
71
+ return pipe_in.read
72
+ end
73
+
74
+ def read_phrase_table fn
75
+ table = {}
76
+ f = ReadFile.new fn
77
+ while raw_rule = f.gets
78
+ french, english, features = splitpipe(raw_rule)
79
+ feature_map = SparseVector.from_kv features
80
+ if table.has_key? french
81
+ table[french] << [english, feature_map ]
82
+ else
83
+ table[french] = [[english, feature_map]]
84
+ end
85
+ end
86
+ f.close
87
+ return table
88
+ end
89
+
90
+ def cdec_kbest cdec_bin, input, ini, weights, k, unique=true
91
+ require 'open3'
92
+ cmd = "echo \"#{input}\" | #{cdec_bin} -c #{ini} -w #{weights} -k #{k}"
93
+ cmd += " -r" if unique
94
+ o,_ = Open3.capture2 "#{cmd} 2>/dev/null"
95
+ a = []; j = -1
96
+ o.split("\n").map{ |i| j+=1; t=Translation.new; t.from_s(i, false, j); a << t }
97
+ return a
98
+ end
99
+
100
+ def read_config fn
101
+ f = ReadFile.new fn
102
+ cfg = {}
103
+ while line = f.gets
104
+ line.strip!
105
+ next if /^\s*$/.match line
106
+ next if line[0]=='#'
107
+ content = line.split('#', 2).first
108
+ k, v = content.split(/\s*=\s*/, 2)
109
+ k.strip!; v.strip!
110
+ cfg[k] = v
111
+ end
112
+ return cfg
113
+ end
114
+
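A minimal sketch of two helpers above: PriorityQueue (which pops the element with the lowest score, matching its use in DAG::dijkstra) and spawn_with_timeout:

  require 'zipf'

  Item = Struct.new(:score)
  q = PriorityQueue.new [Item.new(3.0), Item.new(1.0), Item.new(2.0)]
  puts q.pop.score   # 1.0
  puts q.pop.score   # 2.0

  puts spawn_with_timeout('echo hello', 2)   # "hello\n"; killed after 2 seconds otherwise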
data/lib/zipf/semirings.rb ADDED
@@ -0,0 +1,81 @@
1
+ # Semirings for directed acyclic graphs (dags) (also directed hypergraphs),
2
+ # as described in:
3
+ # 'Dynamic Programming Algorithms in
4
+ # Semiring and Hypergraph Frameworks' (Liang Huang)
5
+ #
6
+
7
+ class Semiring
8
+ attr_accessor :add, :multiply, :one, :null, :convert
9
+ end
10
+
11
+ class BooleanSemiring < Semiring
12
+ def initialize
13
+ @add = Proc.new { |a,b| a||b }
14
+ @multiply = Proc.new { |a,b| a&&b }
15
+ @one = true
16
+ @null = false
17
+ @convert = Proc.new { |v| true && v!=0 }
18
+ end
19
+ end
20
+
21
+ class ViterbiSemiring < Semiring
22
+ def initialize
23
+ @add = Proc.new { |a,b| [a,b].max }
24
+ @multiply = Proc.new { |a,b| a*b }
25
+ @one = 1.0
26
+ @null = 0.0
27
+ @convert = Proc.new { |v| v }
28
+ end
29
+ end
30
+
31
+ class ViterbiLogSemiring < Semiring
32
+ def initialize
33
+ @add = Proc.new { |a,b| [a,b].max }
34
+ @multiply = Proc.new { |a,b| a+b }
35
+ @one = 0.0
36
+ @null = -1.0/0.0
37
+ @convert = Proc.new { |v| v }
38
+ end
39
+ end
40
+
41
+ class InsideSemiring < Semiring
42
+ def initialize
43
+ @add = Proc.new { |a,b| a+b }
44
+ @multiply = Proc.new { |a,b| a*b }
45
+ @one = 1.0
46
+ @null = 0.0
47
+ @convert = Proc.new { |v| v }
48
+ end
49
+ end
50
+
51
+ class RealSemiring < Semiring
52
+ def initialize
53
+ @add = Proc.new { |a,b| [a,b].min }
54
+ @multiply = Proc.new { |a,b| a+b }
55
+ @one = 0.0
56
+ @null = 1.0/0.0
57
+ @convert = Proc.new { |v| v }
58
+ end
59
+ end
60
+
61
+ # for longest/worst paths
62
+ class RealxSemiring < Semiring
63
+ def initialize
64
+ @add = Proc.new { |a,b| [a,b].max }
65
+ @multiply = Proc.new { |a,b| a+b }
66
+ @one = -1.0/0.0
67
+ @null = 0.0
68
+ @convert = Proc.new { |v| v }
69
+ end
70
+ end
71
+
72
+ class CountingSemiring < Semiring
73
+ def initialize
74
+ @add = Proc.new { |a,b| a+b }
75
+ @multiply = Proc.new { |a,b| a*b }
76
+ @one = 1.0
77
+ @null = 0.0
78
+ @convert = Proc.new { |v| if v!=0 then 1 else 0 end }
79
+ end
80
+ end
81
+
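The semirings above expose their operations as plain Procs, which is what the generic DAG/HG routines call. A minimal sketch:

  require 'zipf'

  v = ViterbiSemiring.new          # (max, *): best product of edge weights
  puts v.add.call(0.3, 0.7)        # 0.7
  puts v.multiply.call(0.3, 0.7)   # 0.21

  r = RealSemiring.new             # (min, +): shortest-path distances
  puts r.add.call(2.0, 5.0)        # 2.0
  puts r.multiply.call(2.0, 5.0)   # 7.0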
data/lib/zipf/stringutil.rb ADDED
@@ -0,0 +1,22 @@
1
+ def tokenize s
2
+ s.strip.split
3
+ end
4
+
5
+ def ngrams(s, n, fix=false)
6
+ a = tokenize s
7
+ a.each_with_index { |tok, i|
8
+ tok.strip!
9
+ 0.upto([n-1, a.size-i-1].min) { |m|
10
+ yield a[i..i+m] if !fix||(fix&&a[i..i+m].size==n)
11
+ }
12
+ }
13
+ end
14
+
15
+ def bag_of_words s, stopwords=[]
16
+ s.strip.split.uniq.sort.reject{ |w| stopwords.include? w }
17
+ end
18
+
19
+ def splitpipe s, n=3
20
+ s.strip.split("|"*n)
21
+ end
22
+
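A minimal sketch of the string helpers above:

  require 'zipf'

  ngrams('a small house', 2, true) { |ng| puts ng.join(' ') }
  # a small
  # small house

  puts splitpipe('0 ||| a small house ||| -1.0').inspect
  # ["0 ", " a small house ", " -1.0"]

  puts bag_of_words('the cat the hat', ['the']).inspect
  # ["cat", "hat"]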
data/lib/zipf/tfidf.rb ADDED
@@ -0,0 +1,38 @@
1
+ module TFIDF
2
+
3
+
4
+ # returns key='raw frequency' for an
5
+ # array-like object
6
+ def TFIDF::tf array, stopwords=[]
7
+ v = {}; v.default = 0
8
+ array.uniq.each { |i|
9
+ next if stopwords.include? i
10
+ v[i] = array.count(i).to_f
11
+ }
12
+ return v
13
+ end
14
+
15
+ # smoothes raw frequencies of tf() in-place
16
+ # a is a smoothing term
17
+ def TFIDF::ntf hash, a=0.4
18
+ max = hash.values.max.to_f
19
+ hash.each_pair { |k,v|
20
+ hash[k] = a + (1-a)*(v/max)
21
+ }
22
+ end
23
+
24
+ # returns the idf value for each word, given a hash mapping document ids to tf() hashes
25
+ def TFIDF::idf list_of_hashes
26
+ vocab = list_of_hashes.values.map { |h| h.keys }.flatten.uniq
27
+ n = list_of_hashes.size.to_f
28
+ idf = {}
29
+ vocab.each { |i|
30
+ df = list_of_hashes.values.select { |h| h.has_key? i }.size
31
+ idf[i] = Math.log(n/df)
32
+ }
33
+ return idf
34
+ end
35
+
36
+
37
+ end #module
38
+
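A minimal sketch of the TFIDF helpers above, reading the idf input as a hash that maps document ids to tf() hashes (toy data):

  require 'zipf'

  docs = { 'd1' => %w(a a b), 'd2' => %w(b c) }
  tfs  = {}
  docs.each_pair { |id,words| tfs[id] = TFIDF::tf words }
  TFIDF::ntf tfs['d1']   # smooth d1's raw counts in place
  idf = TFIDF::idf tfs
  puts idf['b']          # log(2/2) = 0.0
  puts idf['a']          # log(2/1) ~= 0.69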
metadata ADDED
@@ -0,0 +1,55 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: zipf
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Patrick Simianer
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-06-16 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: NLP related tools and classes
14
+ email: p@simianer.de
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/zipf.rb
20
+ - lib/zipf/stringutil.rb
21
+ - lib/zipf/misc.rb
22
+ - lib/zipf/grammar.rb
23
+ - lib/zipf/hg.rb
24
+ - lib/zipf/fileutil.rb
25
+ - lib/zipf/semirings.rb
26
+ - lib/zipf/dag.rb
27
+ - lib/zipf/SparseVector.rb
28
+ - lib/zipf/tfidf.rb
29
+ - lib/zipf/bleu.rb
30
+ - lib/zipf/Translation.rb
31
+ homepage: http://simianer.de
32
+ licenses:
33
+ - MIT
34
+ metadata: {}
35
+ post_install_message:
36
+ rdoc_options: []
37
+ require_paths:
38
+ - lib
39
+ required_ruby_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ required_rubygems_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - '>='
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ requirements: []
50
+ rubyforge_project:
51
+ rubygems_version: 2.0.3
52
+ signing_key:
53
+ specification_version: 4
54
+ summary: zipf
55
+ test_files: []