zipf 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e8021bb7a07d98332028ec75ff1c3bf53149cab3
4
+ data.tar.gz: 958c53844c7e0b1b76b44bcc26fe60736449a1cd
5
+ SHA512:
6
+ metadata.gz: c444514cec3f6154c9011db7aac92b579e046dea3c88e24781db728f46cb67c1e789c50007edbc4dfe202c448a86b158f170ac3a6baf5cd4cae4ff5fd422b5c7
7
+ data.tar.gz: 43c54fa8adf44ef26d0894bb00a8de9e8ae30cc79efacbf40d20f7c8bc116a6cbecd2f641ccc23c582ddb4e8dc69627958d7b79c57f6fd1cd5f250fed3df6c50
data/lib/zipf.rb ADDED
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'zipf/stringutil'
4
+ require 'zipf/fileutil'
5
+ require 'zipf/SparseVector'
6
+ require 'zipf/tfidf'
7
+ require 'zipf/Translation'
8
+ require 'zipf/dag'
9
+ require 'zipf/semirings'
10
+ require 'zipf/bleu'
11
+ require 'zipf/misc'
12
+ require 'zipf/hg'
13
+ require 'zipf/grammar'
14
+
15
# Treat all three standard streams as UTF-8.
[STDIN, STDOUT, STDERR].each { |io| io.set_encoding 'utf-8' }
18
+
@@ -0,0 +1,172 @@
1
# Sparse numeric vector stored as a Hash from dimension key to value.
# Dimensions that were never set read as 0 (the Hash default).
class SparseVector < Hash

  # Builds an empty vector. When +arg+ is an Array its elements are stored
  # under their positions (see #from_a); any other argument is ignored.
  def initialize arg=nil
    super()
    self.default = 0
    from_a arg if arg.is_a? Array
  end

  # Stores a[j] under key j for every position j of the Array +a+.
  def from_a a
    a.each_with_index { |i,j| self[j] = i }
  end

  def self.from_a a
    v = SparseVector.new
    v.from_a a
    return v
  end

  # Copies every key/value pair of +h+ into this vector.
  def from_h h
    h.each_pair { |k,v| self[k] = v }
  end

  def self.from_h h
    v = SparseVector.new
    v.from_h h
    return v
  end

  # Loads the vector from a String containing a Ruby Hash literal.
  # SECURITY: the string is passed to eval and can run arbitrary code --
  # only call this with trusted input.
  def from_s s
    from_h eval(s)
  end

  def self.from_s s
    v = SparseVector.new
    v.from_s s
    return v
  end

  # Serializes as "key<sep>value" items joined by +join+.
  def to_kv sep='=', join=' '
    map { |k,v| "#{k}#{sep}#{v}" }.join join
  end

  # Parses whitespace separated "key<sep>value" items; values are converted
  # with to_f. +sep+ defaults to '=' and mirrors the parameter of #to_kv.
  def from_kv s, sep='='
    s.split.each { |i|
      k,v = i.split(sep)
      self[k] = v.to_f
    }
  end

  def self.from_kv s, sep='='
    v = SparseVector.new
    v.from_kv s, sep
    return v
  end

  # Reads one "key<sep>value" pair per line from file +fn+ (via ReadFile).
  # Blank lines are skipped and the file is closed even if parsing fails.
  def from_file fn, sep='='
    f = ReadFile.new(fn)
    begin
      while line = f.gets
        line = line.strip
        next if line.empty?
        key, value = line.split sep
        self[key] = value.to_f
      end
    ensure
      f.close
    end
  end

  def self.from_file fn, sep='='
    v = SparseVector.new
    v.from_file fn, sep
    return v
  end

  # Union of the keys of both vectors, without duplicates.
  def join_keys other
    self.keys | other.keys
  end

  # Sum of all stored values (0 for an empty vector).
  def sum
    self.values.inject(0, :+)
  end

  # True when both vectors have the same keys and every pair of values
  # differs by at most +p+.
  def approx_eql? other, p=10**-10
    return false if !other
    return false if other.size!=self.size
    return false if other.keys.sort!=self.keys.sort
    self.keys.all? { |k| (self[k]-other[k]).abs<=p }
  end

  # Arithmetic mean of the stored values.
  def average
    self.sum/self.size.to_f
  end

  # Population variance of the stored values: the mean squared deviation
  # from #average (the sum of squares is divided by the number of values).
  def variance
    avg = self.average
    sq = 0.0
    self.values.each { |i| sq += (avg - i)**2 }
    return sq/self.size
  end

  def stddev
    Math.sqrt self.variance
  end

  # Dot product; dimensions missing from +other+ contribute other's default.
  def dot other
    sum = 0.0
    self.each_pair { |k,v| sum += v * other[k] }
    return sum
  end

  # Sets dimensions 0..n-1 to 0.0.
  def zeros n
    (0).upto(n-1) { |i| self[i] = 0.0 }
  end

  # Euclidean norm. The fold is seeded with 0.0 so that the first value is
  # squared like all the others.
  def magnitude
    Math.sqrt self.values.inject(0.0) { |sum,i| sum+i**2 }
  end

  def cosinus_sim other
    self.dot(other)/(self.magnitude*other.magnitude)
  end

  # Euclidean distance over the union of both key sets.
  def euclidian_dist other
    sum = 0.0
    join_keys(other).each { |d| sum += (self[d] - other[d])**2 }
    return Math.sqrt(sum)
  end

  # Element-wise sum over the union of both key sets.
  def + other
    new = SparseVector.new
    join_keys(other).each { |k|
      new[k] = self[k]+other[k]
    }
    return new
  end

  # Element-wise difference over the union of both key sets.
  def - other
    new = SparseVector.new
    join_keys(other).each { |k|
      new[k] = self[k]-other[k]
    }
    return new
  end

  # Scales every value by the Numeric +scalar+.
  def * scalar
    raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric
    new = SparseVector.new
    self.keys.each { |k|
      new[k] = self[k] * scalar
    }
    return new
  end

  # Per-dimension mean of the vectors in +a+; each sum is divided by a.size.
  def self.mean a
    mean = SparseVector.new
    a.each { |i|
      i.each_pair { |k,v|
        mean[k] += v
      }
    }
    n = a.size.to_f
    mean.each_pair { |k,v| mean[k] = v/n }
    return mean
  end
end
172
+
@@ -0,0 +1,72 @@
1
# One entry of a k-best list: id, raw and cleaned surface string, feature
# vector, named scores and rank.
class Translation
  attr_accessor :id, :s, :raw, :f, :scores, :rank

  def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil
    @id = id
    @raw = raw
    @s = s
    @f = f
    @scores = scores
    @rank = rank
  end

  # Fills this object from a "id ||| string ||| features ||| score" line.
  # With strip_alignment the alignment markers matched by the regexp below
  # are removed from the string and whitespace is squeezed.
  def from_s t, strip_alignment=true, rank=nil
    id, raw, features, score = splitpipe(t, 3)
    raw.strip!
    @raw = raw
    if strip_alignment # the way moses does it
      @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ')
      @s.strip!
    else
      @s = raw
    end
    @id = id.to_i
    @f = SparseVector.from_kv features
    @scores[:decoder] = score.to_f
    @rank = rank
  end

  def self.from_s s
    t = self.new
    t.from_s s
    return t
  end

  # Serializes as "id ||| s ||| features ||| decoder score"; the feature
  # field is omitted when include_features is false. A single if/else is
  # used so that the chosen string is actually the return value.
  def to_s include_features=true
    if include_features
      [@id, @s, @f.to_kv('=', ' '), @scores[:decoder]].join(' ||| ')
    else
      [@id, @s, @scores[:decoder]].join(' ||| ')
    end
  end

  # NOTE(review): @score is never assigned anywhere in this class, so the
  # third field is always empty -- confirm whether @scores[:decoder] was meant.
  def to_s2
    [@rank, @s, @score, @scores.to_s].join ' ||| '
  end
end
44
+
45
# Reads a file of k-best entries (one per line, see Translation#from_s) and
# groups consecutive lines that share the same leading id into one list.
# Within each list the entries' ids are overwritten with their 0-based
# position. Returns an Array of Arrays; the final (possibly empty) group is
# always appended. The file is closed even when a line fails to parse.
def read_kbest_lists fn, translation_type=Translation
  kbest_lists = []
  cur = []
  prev = -1
  pos = 0
  f = ReadFile.new fn
  begin
    while line = f.gets
      t = translation_type.new
      t.from_s line
      c = splitpipe(line)[0].to_i
      if c != prev
        # a new source id starts a new list
        kbest_lists << cur unless cur.empty?
        cur = []
        prev = c
        pos = 0
      end
      t.id = pos
      cur << t
      pos += 1
    end
  ensure
    f.close
  end
  kbest_lists << cur # last one
  return kbest_lists
end
72
+
data/lib/zipf/bleu.rb ADDED
@@ -0,0 +1,130 @@
1
# BLEU scoring: n-gram statistics, corpus-level and per-sentence scores.
# Relies on the global helpers tokenize and ngrams.
module BLEU


# Accumulated n-gram statistics for orders 1..n (index m holds order m+1).
class NgramCounts
  attr_accessor :sum, :clipped, :ref_len, :hyp_len, :n

  def initialize(n)
    @n = 0
    @sum = []
    @clipped = []
    @ref_len = 0.0
    @hyp_len = 0.0
    grow(n)
  end

  # Extends the count arrays with zeros up to order n.
  def grow(n)
    (n-@n).times {
      @sum << 0.0
      @clipped << 0.0
    }
    @n = n
  end

  # Adds the statistics of +other+ to this object in place.
  def plus_eq(other)
    grow(other.n) if other.n > @n
    0.upto(other.n-1) { |m|
      @sum[m] += other.sum[m]
      @clipped[m] += other.clipped[m]
    }
    @ref_len += other.ref_len
    @hyp_len += other.hyp_len
  end

  def to_s
    return "n=#{n} sum=#{sum} clipped=#{clipped} ref_len=#{ref_len} hyp_len=#{hyp_len}"
  end
end

# Multiset of n-grams, keyed by the space-joined tokens.
class Ngrams
  def initialize
    @h_ = {}
    @h_.default = 0
  end

  def add(k)
    k = k.join ' ' if k.class == Array
    @h_[k] += 1
  end

  def get_count(k)
    k = k.join ' ' if k.class == Array
    return @h_[k]
  end

  # Yields each n-gram as a token Array together with its count.
  def each
    @h_.each_pair { |k,v|
      yield k.split, v
    }
  end

  def to_s
    @h_.to_s
  end
end

# Collects total and clipped n-gram counts (orders 1..n) plus lengths for
# one hypothesis/reference pair; every statistic is multiplied by +times+.
def self.get_counts hypothesis, reference, n, times=1
  p = NgramCounts.new n
  r = Ngrams.new
  ngrams(reference, n) { |ng| r.add ng }
  h = Ngrams.new
  ngrams(hypothesis, n) { |ng| h.add ng }
  h.each { |ng,count|
    sz = ng.size-1
    p.sum[sz] += count * times
    p.clipped[sz] += [r.get_count(ng), count].min * times
  }
  p.ref_len = tokenize(reference.strip).size * times
  p.hyp_len = tokenize(hypothesis.strip).size * times
  return p
end

# Log-domain brevity penalty: min(0, 1 - (r+smooth)/c) for hypothesis
# length c and reference length r.
def self.brevity_penalty c, r, smooth=0.0
  return [0.0, 1.0-((r+smooth)/c)].min
end

# Corpus-level BLEU over an Array of NgramCounts. Returns 0.0 as soon as
# one order has no clipped matches or no n-grams at all. With debug=true
# the per-order counts, the brevity penalty and the precision term are
# written to STDERR.
def self.bleu counts, n, debug=false
  corpus_stats = NgramCounts.new n
  counts.each { |i| corpus_stats.plus_eq i }
  logbleu = 0.0
  0.upto(n-1) { |m|
    STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug
    # check the count of this order, not the whole Array
    return 0.0 if corpus_stats.clipped[m] == 0 || corpus_stats.sum[m] == 0
    logbleu += Math.log(corpus_stats.clipped[m]) - Math.log(corpus_stats.sum[m])
  }
  logbleu /= n
  if debug
    STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)}\n"
    STDERR.write "sum #{Math.exp(logbleu)}\n"
  end
  logbleu += brevity_penalty corpus_stats.hyp_len, corpus_stats.ref_len
  return Math.exp logbleu
end

# BLEU scaled to 0..100 and rounded to three decimals.
def self.hbleu counts, n, debug=false
  (100*bleu(counts, n, debug)).round(3)
end

# Sentence-level BLEU with add-one smoothing for orders >= 2.
# Matches are clipped: an n-gram counts at most as often as it occurs in
# the hypothesis and at most as often as it occurs in the reference.
# Returns 0.0 when the hypothesis or the reference has no tokens.
def self.per_sentence_bleu hypothesis, reference, n=4, smooth=0.0
  h_ng = {}; r_ng = {}
  (1).upto(n) { |i| h_ng[i] = []; r_ng[i] = [] }
  ngrams(hypothesis, n) { |i| h_ng[i.size] << i }
  ngrams(reference, n) { |i| r_ng[i.size] << i }
  m = [n, reference.split.size].min
  return 0.0 if m < 1 || h_ng[1].empty?
  add = 0.0
  logbleu = 0.0
  (1).upto(m) { |i|
    counts_clipped = 0
    counts_sum = h_ng[i].size
    h_ng[i].uniq.each { |j|
      counts_clipped += [h_ng[i].count(j), r_ng[i].count(j)].min
    }
    add = 1.0 if i >= 2
    logbleu += Math.log(counts_clipped+add) - Math.log(counts_sum+add)
  }
  logbleu /= m
  logbleu += brevity_penalty hypothesis.strip.split.size, reference.strip.split.size, smooth
  return Math.exp logbleu
end


end #module
130
+
data/lib/zipf/dag.rb ADDED
@@ -0,0 +1,205 @@
1
# Directed acyclic graphs with semiring-based shortest/best path algorithms.
module DAG

require 'json'


class Node
  attr_accessor :label, :outgoing, :incoming, :score, :mark

  def initialize label=nil, outgoing=[], incoming=[], score=nil
    @label = label
    @outgoing = outgoing
    @incoming = incoming
    @score = score
  end

  # Adds an edge self -> head to the outgoing list and returns it. The
  # caller is responsible for registering it in head.incoming.
  # Raises ArgumentError for a self-loop instead of terminating the process.
  def add_edge head, weight=0
    raise ArgumentError, "self-cycles are not allowed" if self==head
    @outgoing << DAG::Edge.new(self, head, weight)
    return @outgoing.last
  end

  def to_s
    "DAG::Node<label:#{label}, outgoing:#{outgoing.size}, incoming:#{incoming.size}>"
  end

  def repr
    "#{to_s} #{@score} out:#{@outgoing} in:[#{@incoming.map{|e| e.to_s}.join ', '}]"
  end
end

class Edge
  attr_accessor :tail, :head, :weight, :mark

  def initialize tail=nil, head=nil, weight=0
    @tail = tail
    @head = head
    @weight = weight
    @mark = false # did we already follow this edge? -- for topological sorting
  end

  def to_s
    s = "DAG::Edge<#{@tail} ->[#{weight}] #{@head}"
    s += " x" if @mark
    s += ">"
    s
  end
end

# depth-first search
# w/o markings as we do not have cycles
# Returns the first node labelled target_label reachable from n, or nil.
# Children are visited last-to-first; a branch that does not contain the
# target no longer ends the search.
def self.dfs n, target_label
  return n if n.label==target_label # assumes uniq labels!
  n.outgoing.reverse_each { |e|
    found = dfs e.head, target_label
    return found if found
  }
  return nil
end

# breadth-first search
# w/o markings as we do not have cycles
def self.bfs n, target_label
  queue = [n]
  until queue.empty?
    m = queue.shift
    return m if m.label==target_label
    m.outgoing.each { |e| queue << e.head }
  end
  return nil
end

# topological sort
# Edge marks are cleared first, so the function can be called repeatedly on
# the same graph (the viterbi variants below each call it).
def self.topological_sort graph
  graph.each { |n| n.outgoing.each { |e| e.mark = false } }
  sorted = []
  s = graph.select { |n| n.incoming.empty? }
  until s.empty?
    sorted << s.shift
    sorted.last.outgoing.each { |e|
      e.mark = true
      # a node is ready once all of its incoming edges were followed
      s << e.head if e.head.incoming.all? { |f| f.mark }
    }
  end
  return sorted
end

# initialize graph scores with semiring null, the source with semiring one
def self.init graph, semiring, source_node
  graph.each {|n| n.score=semiring.null}
  source_node.score = semiring.one
end

# viterbi: pulls scores along incoming edges in topological order
# (the default semiring is an instance, as init needs #null and #one)
def self.viterbi graph, semiring=ViterbiSemiring.new, source_node
  toposorted = topological_sort(graph)
  init(graph, semiring, source_node)
  toposorted.each { |n|
    n.incoming.each { |e|
      n.score = semiring.add.call(n.score,
                  semiring.multiply.call(e.tail.score, e.weight))
    }
  }
end

# forward viterbi: pushes scores along outgoing edges in topological order
def self.viterbi_forward graph, semiring=ViterbiSemiring.new, source_node
  toposorted = topological_sort(graph)
  init(graph, semiring, source_node)
  toposorted.each { |n|
    n.outgoing.each { |e|
      e.head.score = semiring.add.call(e.head.score,
                       semiring.multiply.call(n.score, e.weight))
    }
  }
end

# Dijkstra algorithm
# for A*-search we would need an optimistic estimate of
# future cost at each node
def self.dijkstra graph, semiring=RealSemiring.new, source_node
  init(graph, semiring, source_node)
  q = PriorityQueue.new graph
  until q.empty?
    n = q.pop
    n.outgoing.each { |e|
      e.head.score = semiring.add.call(e.head.score,
                       semiring.multiply.call(n.score, e.weight))
      q.sort! # scores changed, restore the queue order
    }
  end
end

# Bellman-Ford algorithm
def self.bellman_ford(graph, semiring=RealSemiring.new, source_node)
  init(graph, semiring, source_node)
  edges = graph.flat_map { |n| n.outgoing }.uniq
  # relax edges
  (graph.size-1).times {
    edges.each { |e|
      e.head.score = semiring.add.call(e.head.score,
                       semiring.multiply.call(e.tail.score, e.weight))
    }
  }
  # we do not allow cycles (negative or positive)
end

# Floyd algorithm
# Returns the all-pairs distance matrix, indexed by node position in graph.
# The semiring argument is accepted but not used.
def self.floyd(graph, semiring=nil)
  size = graph.size
  dist_matrix = Array.new(size) { |i|
    Array.new(size) { |j| i==j ? 0.0 : Float::INFINITY }
  }
  graph.flat_map { |n| n.outgoing }.uniq.each { |e|
    dist_matrix[graph.index(e.tail)][graph.index(e.head)] = e.weight
  }
  0.upto(size-1) { |k|
    0.upto(size-1) { |i|
      0.upto(size-1) { |j|
        via = dist_matrix[i][k] + dist_matrix[k][j]
        dist_matrix[i][j] = via if via < dist_matrix[i][j]
      }
    }
  }
  return dist_matrix
end


# returns a list of nodes (graph) and a hash for finding
# nodes by their label (these need to be unique!)
def self.read_graph_from_json fn, semiring=RealSemiring.new
  graph = []
  nodes_by_label = {}
  h = JSON.parse File.read(fn) # File.read closes the file again
  h['nodes'].each { |i|
    n = DAG::Node.new i['label']
    graph << n
    nodes_by_label[n.label] = n
  }
  h['edges'].each { |i|
    n = nodes_by_label[i['tail']]
    a = n.add_edge(nodes_by_label[i['head']], semiring.convert.call(i['weight'].to_f))
    nodes_by_label[i['head']].incoming << a
  }
  return graph, nodes_by_label
end


end #module
205
+
@@ -0,0 +1,88 @@
1
+ require 'zlib'
2
+
3
+
4
# Read-only file wrapper: names ending in "gz" are read through gzip,
# "-" reads from STDIN, everything else is opened as a plain file.
class ReadFile

  def initialize fn, encoding='utf-8'
    if fn.split('.').last == 'gz'
      @f = Zlib::GzipReader.new(File.new(fn, 'rb'), :external_encoding=>encoding)
    elsif fn == '-'
      @f = STDIN
      STDIN.set_encoding encoding
    else
      @f = File.new fn, 'r'
      @f.set_encoding encoding
    end
  end

  # Returns the next line, or nil at end of file.
  def gets
    @f.gets
  end

  def readlines
    @f.readlines
  end

  def self.readlines fn, encoding='utf-8'
    with_file(fn, encoding) { |f| f.readlines }
  end

  # All lines with surrounding whitespace removed.
  def readlines_strip
    self.readlines.map{ |i| i.strip }
  end

  def self.readlines_strip fn, encoding='utf-8'
    with_file(fn, encoding) { |f| f.readlines_strip }
  end

  def read
    @f.read
  end

  def self.read fn, encoding='utf-8'
    with_file(fn, encoding) { |f| f.read }
  end

  # Closes the underlying stream; STDIN is left open.
  def close
    @f.close if @f!=STDIN
  end

  # Opens fn, yields the ReadFile and closes it afterwards, also when the
  # block raises. Returns the block's value.
  def self.with_file fn, encoding
    f = ReadFile.new fn, encoding
    begin
      yield f
    ensure
      f.close
    end
  end
  private_class_method :with_file
end
59
+
60
# Write-only file wrapper: names ending in "gz" are written through gzip,
# "-" writes to STDOUT, everything else is opened (truncated) as a plain file.
class WriteFile

  def initialize fn, encoding='utf-8'
    if fn.split('.').last == 'gz'
      @f = Zlib::GzipWriter.new(File.new(fn, 'wb+'), :external_encoding=>encoding)
    elsif fn == '-'
      @f = STDOUT
      STDOUT.set_encoding encoding
    else
      @f = File.new fn, 'w+'
      @f.set_encoding encoding
    end
  end

  def write s
    @f.write s
  end

  # Writes the string s to file fn and closes it, also when writing raises.
  def self.write s, fn, encoding='utf-8'
    f = WriteFile.new fn, encoding
    begin
      f.write s
    ensure
      f.close
    end
  end

  # Closes the underlying stream; STDOUT (the "-" target) is left open so
  # that later output of the process is not lost.
  def close
    @f.close if @f!=STDOUT
  end
end
88
+
@@ -0,0 +1,123 @@
1
# Rules, symbols and rule collections for a grammar read from
# "lhs ||| rhs ||| e" lines.
module Grammar


# Terminal symbol.
class T
  attr_accessor :word

  def initialize word
    @word = word
  end

  def to_s
    "T<#{@word}>"
  end
end

# Non-terminal symbol with an index and a (initially empty) span.
class NT
  attr_accessor :symbol, :index, :span

  def initialize symbol, index=0
    @symbol = symbol
    @index = index
    @span = Span.new
  end

  def to_s
    "NT(#{@span.left},#{@span.right})<#{@symbol},#{@index}>"
  end
end

class Rule
  attr_accessor :lhs, :rhs, :e

  def initialize lhs=nil, rhs=[], e=''
    @lhs = lhs
    @rhs = rhs
    @e = e
  end

  def to_s
    "#{lhs} -> #{rhs.map{ |i| i.to_s }.join ' '} [arity=#{arity}] ||| #{@e}"
  end

  # Number of non-terminals on the right-hand side.
  def arity
    rhs.count { |i| i.class == NT }
  end

  # Parses "lhs ||| rhs ||| e". Square brackets around the lhs are optional.
  # A rhs token of the form "[SYM,...]" becomes an NT whose symbol is the
  # text before the first comma; every other token becomes a T. Parsed
  # symbols are appended to the existing rhs.
  def from_s s
    fields = splitpipe s, 3
    # non-bang gsub: gsub! would return nil for a lhs without brackets
    @lhs = NT.new fields[0].strip.gsub(/(\[|\])/, "")
    fields[1].split.each { |x|
      if x[0]=='[' && x[x.size-1] == ']'
        @rhs << NT.new(x.gsub(/(\[|\])/, "").split(',')[0])
      else
        @rhs << T.new(x)
      end
    }
    @e = fields[2]
  end

  def self.from_s s
    r = self.new
    r.from_s s
    return r
  end
end

class Span
  attr_accessor :left, :right

  def initialize left=nil, right=nil
    @left = left
    @right = right
  end
end

# All rules of a grammar file plus three indexes: rules whose rhs starts
# with a non-terminal (startn), rules without non-terminals (flat) and the
# remaining rules, which start with a terminal (startt).
class Grammar
  attr_accessor :rules, :startn, :startt, :flat

  # Reads one rule per line from fn; progress dots are written to STDERR.
  def initialize fn
    @rules = []; @startn = []; @startt = [] ;@flat = []
    ReadFile.readlines_strip(fn).each_with_index { |s,i|
      STDERR.write '.'; STDERR.write " #{i+1}\n" if (i+1)%80==0
      rule = Rule.from_s(s)
      @rules << rule
      if rule.rhs.first.class == NT
        @startn << rule
      elsif rule.arity == 0
        @flat << rule
      else
        @startt << rule
      end
    }
    STDERR.write "\n"
  end

  def to_s
    @rules.map { |r| r.to_s+"\n" }.join
  end

  # For every distinct lhs symbol other than 'S' adds "S -> symbol" and
  # "S -> S X" to the rules and to startn.
  # NOTE(review): "S -> S X" is therefore added once per symbol -- confirm
  # that the duplicates are intended.
  def add_glue_rules
    @rules.map { |r| r.lhs.symbol }.select { |s| s != 'S' }.uniq.each { |symbol|
      @rules << Rule.new(NT.new('S'), [NT.new(symbol)])
      @startn << @rules.last
      @rules << Rule.new(NT.new('S'), [NT.new('S'), NT.new('X')])
      @startn << @rules.last
    }
  end

  # Adds a flat rule "X -> word" for every word in s.
  def add_pass_through_rules s
    s.each { |word|
      @rules << Rule.new(NT.new('X'), [T.new(word)])
      @flat << @rules.last
    }
  end
end


end #module
123
+
data/lib/zipf/hg.rb ADDED
@@ -0,0 +1,173 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative 'semirings'
4
+ require 'json'
5
+
6
+
7
# Directed hypergraphs with semiring-based inference.
module HG


class Node
  attr_accessor :label, :cat, :outgoing, :incoming, :score

  def initialize label=nil, cat=nil, outgoing=[], incoming=[], score=nil
    @label = label
    @cat = cat
    @outgoing = outgoing
    @incoming = incoming
    @score = score
  end

  def to_s
    "Node<label:\"#{@label}\", cat:\"#{@cat}\", outgoing:#{@outgoing.size}, incoming:#{@incoming.size}>"
  end
end

class Hypergraph
  attr_accessor :nodes, :edges

  def initialize nodes=[], edges=[]
    @nodes = nodes
    @edges = edges
  end

  # Largest edge arity (nil when there are no edges).
  def arity
    @edges.map { |e| e.arity }.max
  end

  def to_s
    "Hypergraph<nodes:[#{@nodes.to_s}], edges:[#{@edges.to_s}], arity:#{arity}>"
  end
end

# Edge from several tail nodes to one head node. mark counts how many of
# the tails were already visited by topological_sort.
class Hyperedge
  attr_accessor :head, :tails, :weight, :f, :mark, :rule, :left, :right

  def initialize head=nil, tails=[], weight=0.0, f={}
    @head = head
    @tails = tails
    @weight = weight
    @f = f
    @mark = 0
  end

  def arity
    return @tails.size
  end

  # True once every tail was visited.
  def marked?
    arity == @mark
  end

  def to_s
    "Hyperedge<head:\"#{@head.label}\", \"tails:#{@tails.map{|n|n.label}}, arity:#{arity}, weight:#{@weight}, f:#{f.to_s}, mark:#{@mark}>"
  end
end

# Topological order of the nodes, starting with those that have no
# incoming edges. Edge marks are reset first, so repeated calls on the same
# hypergraph (viterbi, viterbi_path, all_paths) give the same result.
def self.topological_sort nodes
  nodes.each { |n| n.outgoing.each { |e| e.mark = 0 } }
  sorted = []
  s = nodes.select { |n| n.incoming.empty? }
  until s.empty?
    sorted << s.shift
    sorted.last.outgoing.each { |e|
      next if e.marked?
      e.mark += 1
      # the head is ready once all of its incoming edges are fully marked
      s << e.head if e.head.incoming.all? { |f| f.mark==f.arity }
    }
  end
  return sorted
end

# Sets every node score to semiring null and the root score to semiring one.
def self.init nodes, semiring, root
  nodes.each { |n| n.score=semiring.null }
  root.score = semiring.one
end

# Scores every node in topological order: an edge contributes the product
# of its tails' scores and its weight; contributions are combined with add.
def self.viterbi hypergraph, root, semiring=ViterbiSemiring.new
  toposorted = topological_sort hypergraph.nodes
  init toposorted, semiring, root
  toposorted.each { |n|
    n.incoming.each { |e|
      s = semiring.one
      e.tails.each { |m|
        s = semiring.multiply.call(s, m.score)
      }
      n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight))
    }
  }
end

# Like viterbi, but also records for every node (in topological order) the
# last incoming edge that improved its score; the entry is nil for nodes
# without such an edge. Returns that list and the score of the last node.
def self.viterbi_path hypergraph, root, semiring=ViterbiSemiring.new
  toposorted = topological_sort hypergraph.nodes
  init toposorted, semiring, root
  best_path = []
  toposorted.each { |n|
    best_edge = nil
    n.incoming.each { |e|
      s = semiring.one
      e.tails.each { |m|
        s = semiring.multiply.call(s, m.score)
      }
      candidate = semiring.multiply.call(s, e.weight)
      best_edge = e if n.score < candidate # ViterbiSemiring add
      n.score = semiring.add.call(n.score, candidate)
    }
    best_path << best_edge
  }
  return best_path, toposorted.last.score
end

# Reads nodes, edges and a weight vector from a JSON file. The weight of an
# edge is the dot product of the weight vector with the edge features, or
# its exponential when log_weights is set. Returns the hypergraph, a
# label => node hash and the nodes in file order.
def self.read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=false
  nodes = []
  edges = []
  nodes_by_label = {}
  nodes_by_index = []
  h = JSON.parse File.read(fn) # File.read closes the file again
  w = SparseVector.from_h h['weights']
  h['nodes'].each { |i|
    n = Node.new i['label'], i['cat']
    nodes << n
    nodes_by_label[n.label] = n
    nodes_by_index << n
  }
  h['edges'].each { |i|
    e = Hyperedge.new(nodes_by_label[i['head']],
                      i['tails'].map{|j| nodes_by_label[j]}.to_a,
                      semiring.convert.call(i['weight'].to_f),
                      {})
    e.f = SparseVector.from_h i['f']
    # the weight from the file is replaced by the model score
    if log_weights
      e.weight = Math.exp(w.dot(e.f))
    else
      e.weight = w.dot(e.f)
    end
    e.tails.each { |m|
      m.outgoing << e
    }
    e.head.incoming << e
    edges << e
  }
  return Hypergraph.new(nodes, edges), nodes_by_label, nodes_by_index
end

# Enumerates every combination that picks one incoming edge for each node
# that has incoming edges. root and semiring are accepted but not used.
def self.all_paths hypergraph, root, semiring=ViterbiSemiring.new
  toposorted = topological_sort hypergraph.nodes
  paths = [[]]
  toposorted.each { |n|
    next if n.incoming.empty?
    new_paths = []
    until paths.empty?
      p = paths.pop
      n.incoming.each { |e|
        new_paths << p+[e]
      }
    end
    paths = new_paths
  }
  return paths
end


end #module
173
+
data/lib/zipf/misc.rb ADDED
@@ -0,0 +1,114 @@
1
+ require 'timeout'
2
+
3
+
4
# Convenience extensions of the core Array class.
class Array
  # Index of the first maximal element.
  def max_index
    index(max)
  end

  # True when every element of self is also contained in other.
  def is_subset_of? other
    all? { |i| other.include? i }
  end

  # Sum of the elements. Without arguments the elements are folded with +
  # (nil for an empty array; works for any objects that implement +).
  # Because this method replaces the core Array#sum, the core call forms
  # are supported too: sum(init) starts from init, and sum { |x| ... } adds
  # up the block results, starting from init or 0.
  def sum init=nil
    if block_given?
      inject(init.nil? ? 0 : init) { |acc, i| acc + yield(i) }
    elsif init.nil?
      inject(:+)
    else
      inject(init, :+)
    end
  end

  # Arithmetic mean as a Float.
  def mean
    sum.to_f/size
  end
end
26
+
27
class String

  # Returns the first lowercase character of the string, or nil when the
  # string contains none.
  def downcase?
    slice(/[[:lower:]]/)
  end
end
33
+
34
# Queue whose pop returns the element with the smallest score.
class PriorityQueue
  # This assumes that elements in the queue
  # have a numerical member named 'score'.

  # Copies the given elements and brings them into queue order.
  def initialize a=Array.new
    @queue = Array.new a
    sort!
  end

  # Orders the elements by descending score, so the smallest score is last.
  def sort!
    @queue.sort_by! { |item| -item.score }
  end

  # Removes and returns the last element (nil when the queue is empty).
  def pop
    @queue.pop
  end

  # Adds an element and re-sorts the queue.
  def push i
    @queue.push i
    sort!
  end

  def empty?
    @queue.empty?
  end
end
60
+
61
# Runs cmd and returns everything it wrote to stdout.
#
# cmd         - command line handed to Process.spawn
# t           - seconds to wait for the process to exit
# ignore_fail - when true, a process that exceeds t is NOT killed; the call
#               then keeps reading until the process closes its stdout
# debug       - echo the command to STDERR first
#
# The output is drained by a separate thread while waiting, so a command
# that writes more than the pipe buffer holds cannot stall until the
# timeout. A killed child is reaped and the read end is always closed.
def spawn_with_timeout cmd, t=4, ignore_fail=false, debug=false
  STDERR.write cmd+"\n" if debug
  pipe_in, pipe_out = IO.pipe
  pid = Process.spawn(cmd, :out => pipe_out)
  pipe_out.close # the child holds its own copy of the write end
  reader = Thread.new { pipe_in.read }
  begin
    Timeout.timeout(t) { Process.wait pid }
  rescue Timeout::Error
    if ignore_fail
      Process.detach pid # let the child finish, but do not leave a zombie
    else
      Process.kill('TERM', pid)
      Process.wait pid
    end
  end
  return reader.value
ensure
  pipe_out.close if pipe_out && !pipe_out.closed?
  pipe_in.close if pipe_in && !pipe_in.closed?
end
73
+
74
# Reads "french ||| english ||| features" lines from fn and returns a Hash
# that maps each french field to an Array of [english, feature vector]
# pairs, in file order. The file is closed even when a line fails to parse.
def read_phrase_table fn
  table = {}
  f = ReadFile.new fn
  begin
    while raw_rule = f.gets
      french, english, features = splitpipe(raw_rule)
      feature_map = SparseVector.from_kv features
      (table[french] ||= []) << [english, feature_map]
    end
  ensure
    f.close
  end
  return table
end
89
+
90
# Runs the cdec decoder on one input line and returns its k-best list as an
# Array of Translation objects whose rank is the 0-based output position.
#
# cdec_bin - decoder command
# input    - source sentence, sent to the decoder on standard input
# ini      - value for -c
# weights  - value for -w
# k        - value for -k
# unique   - adds -r to the command line
#
# The sentence is passed through stdin_data instead of being interpolated
# into a shell `echo "..."`, so quotes, backticks or `$` in the input can
# neither break nor inject into the command line.
# NOTE(review): cdec_bin, ini and weights are still interpolated unescaped;
# they must come from trusted configuration.
def cdec_kbest cdec_bin, input, ini, weights, k, unique=true
  require 'open3'
  cmd = "#{cdec_bin} -c #{ini} -w #{weights} -k #{k}"
  cmd += " -r" if unique
  o,_ = Open3.capture2 "#{cmd} 2>/dev/null", :stdin_data => "#{input}\n"
  return o.split("\n").each_with_index.map { |line, rank|
    t = Translation.new
    t.from_s line, false, rank
    t
  }
end
99
+
100
# Reads "key = value" lines from fn into a Hash of Strings.
# Blank lines and lines starting with '#' are skipped, and everything after
# the first '#' on a line is dropped. A line without '=' stores the key
# with a nil value instead of raising. The file is always closed.
def read_config fn
  cfg = {}
  f = ReadFile.new fn
  begin
    while line = f.gets
      line.strip!
      next if line.empty? || line[0]=='#'
      content = line.split('#', 2).first
      k, v = content.split(/\s*=\s*/, 2)
      cfg[k.strip] = v && v.strip
    end
  ensure
    f.close
  end
  return cfg
end
114
+
@@ -0,0 +1,81 @@
1
+ # Semirings for directed acyclic graphs (dags) (also directed hypergraphs),
2
+ # as described in:
3
+ # 'Dynamic Programming Algorithms in
4
+ # Semiring and Hypergraph Frameworks' (Liang Huang)
5
+ #
6
+
7
# A semiring bundles the two operations and their identity elements:
#   add      - combines alternative scores
#   multiply - extends a score by a weight
#   one      - identity of multiply (score of the source node)
#   null     - identity of add (score of a node that was not reached yet)
#   convert  - maps a raw weight into the semiring's value domain
class Semiring
  attr_accessor :add, :multiply, :one, :null, :convert
end

class BooleanSemiring < Semiring
  def initialize
    @add = Proc.new { |a,b| a||b }
    @multiply = Proc.new { |a,b| a&&b }
    @one = true
    @null = false
    @convert = Proc.new { |v| v!=0 }
  end
end

class ViterbiSemiring < Semiring
  def initialize
    @add = Proc.new { |a,b| [a,b].max }
    @multiply = Proc.new { |a,b| a*b }
    @one = 1.0
    @null = 0.0
    @convert = Proc.new { |v| v }
  end
end

class ViterbiLogSemiring < Semiring
  def initialize
    @add = Proc.new { |a,b| [a,b].max }
    @multiply = Proc.new { |a,b| a+b }
    @one = 0.0
    @null = -Float::INFINITY
    @convert = Proc.new { |v| v }
  end
end

class InsideSemiring < Semiring
  def initialize
    @add = Proc.new { |a,b| a+b }
    @multiply = Proc.new { |a,b| a*b }
    @one = 1.0
    @null = 0.0
    @convert = Proc.new { |v| v }
  end
end

class RealSemiring < Semiring
  def initialize
    @add = Proc.new { |a,b| [a,b].min }
    @multiply = Proc.new { |a,b| a+b }
    @one = 0.0
    @null = Float::INFINITY
    @convert = Proc.new { |v| v }
  end
end

# for longest/worst paths
# one is the identity of + (0.0) and null the identity of max (-Infinity);
# with the two swapped every path score collapses to -Infinity.
class RealxSemiring < Semiring
  def initialize
    @add = Proc.new { |a,b| [a,b].max }
    @multiply = Proc.new { |a,b| a+b }
    @one = 0.0
    @null = -Float::INFINITY
    @convert = Proc.new { |v| v }
  end
end

class CountingSemiring < Semiring
  def initialize
    @add = Proc.new { |a,b| a+b }
    @multiply = Proc.new { |a,b| a*b }
    @one = 1.0
    @null = 0.0
    @convert = Proc.new { |v| if v!=0 then 1 else 0 end }
  end
end
81
+
@@ -0,0 +1,22 @@
1
# Splits s on whitespace after removing leading and trailing whitespace.
def tokenize s
  trimmed = s.strip
  trimmed.split
end
4
+
5
# Yields the n-grams of s as token Arrays: for every start position all
# orders 1..n that fit into the remaining tokens, or only the n-grams of
# exactly n tokens when fix is set.
def ngrams(s, n, fix=false)
  toks = tokenize s
  toks.each_index { |start|
    toks[start].strip!
    longest = [n, toks.size-start].min
    1.upto(longest) { |len|
      gram = toks[start, len]
      next if fix && gram.size != n
      yield gram
    }
  }
end
14
+
15
# Sorted list of the distinct whitespace separated words of s, without the
# words listed in stopwords.
def bag_of_words s, stopwords=[]
  vocabulary = s.strip.split.uniq.sort
  vocabulary.reject { |word| stopwords.include? word }
end
18
+
19
# Splits the stripped string s at every run of n pipe characters
# (default "|||"). The fields keep their surrounding whitespace.
def splitpipe s, n=3
  delimiter = '|' * n
  s.strip.split(delimiter)
end
22
+
data/lib/zipf/tfidf.rb ADDED
@@ -0,0 +1,38 @@
1
module TFIDF


# returns key='raw frequency' for an
# array-like object
# Elements listed in stopwords are left out. The result defaults to 0 for
# unknown keys; counts are Floats. Counting is done in a single pass.
def self.tf array, stopwords=[]
  v = {}; v.default = 0
  array.each { |i|
    next if stopwords.include? i
    v[i] += 1.0
  }
  return v
end

# smoothes raw frequencies of tf() in-place
# a is a smoothing term
# Every value becomes a + (1-a) * value/max, with max the largest value.
def self.ntf hash, a=0.4
  max = hash.values.max.to_f
  hash.each_pair { |k,v|
    hash[k] = a + (1-a)*(v/max)
  }
end

# returns idf value for each word in a vocabulary
# list_of_hashes is read through #values, i.e. it is expected to be a Hash
# whose values are the documents' word lists.
# idf = log(number of documents / number of documents containing the
# word); a word that is repeated inside one document counts once for it.
def self.idf list_of_hashes
  docs = list_of_hashes.values.map { |d| [d].flatten.uniq }
  n = list_of_hashes.size.to_f
  idf = {}
  docs.flatten.uniq.each { |i|
    df = docs.count { |d| d.include? i }
    idf[i] = Math.log(n/df)
  }
  return idf
end


end #module
38
+
metadata ADDED
@@ -0,0 +1,55 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: zipf
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Patrick Simianer
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-06-16 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: NLP related tools and classes
14
+ email: p@simianer.de
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/zipf.rb
20
+ - lib/zipf/stringutil.rb
21
+ - lib/zipf/misc.rb
22
+ - lib/zipf/grammar.rb
23
+ - lib/zipf/hg.rb
24
+ - lib/zipf/fileutil.rb
25
+ - lib/zipf/semirings.rb
26
+ - lib/zipf/dag.rb
27
+ - lib/zipf/SparseVector.rb
28
+ - lib/zipf/tfidf.rb
29
+ - lib/zipf/bleu.rb
30
+ - lib/zipf/Translation.rb
31
+ homepage: http://simianer.de
32
+ licenses:
33
+ - MIT
34
+ metadata: {}
35
+ post_install_message:
36
+ rdoc_options: []
37
+ require_paths:
38
+ - lib
39
+ required_ruby_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ required_rubygems_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - '>='
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ requirements: []
50
+ rubyforge_project:
51
+ rubygems_version: 2.0.3
52
+ signing_key:
53
+ specification_version: 4
54
+ summary: zipf
55
+ test_files: []