zipf 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/zipf.rb +18 -0
- data/lib/zipf/SparseVector.rb +172 -0
- data/lib/zipf/Translation.rb +72 -0
- data/lib/zipf/bleu.rb +130 -0
- data/lib/zipf/dag.rb +205 -0
- data/lib/zipf/fileutil.rb +88 -0
- data/lib/zipf/grammar.rb +123 -0
- data/lib/zipf/hg.rb +173 -0
- data/lib/zipf/misc.rb +114 -0
- data/lib/zipf/semirings.rb +81 -0
- data/lib/zipf/stringutil.rb +22 -0
- data/lib/zipf/tfidf.rb +38 -0
- metadata +55 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: e8021bb7a07d98332028ec75ff1c3bf53149cab3
  data.tar.gz: 958c53844c7e0b1b76b44bcc26fe60736449a1cd
SHA512:
  metadata.gz: c444514cec3f6154c9011db7aac92b579e046dea3c88e24781db728f46cb67c1e789c50007edbc4dfe202c448a86b158f170ac3a6baf5cd4cae4ff5fd422b5c7
  data.tar.gz: 43c54fa8adf44ef26d0894bb00a8de9e8ae30cc79efacbf40d20f7c8bc116a6cbecd2f641ccc23c582ddb4e8dc69627958d7b79c57f6fd1cd5f250fed3df6c50
data/lib/zipf.rb
ADDED
@@ -0,0 +1,18 @@
#!/usr/bin/env ruby

require 'zipf/stringutil'
require 'zipf/fileutil'
require 'zipf/SparseVector'
require 'zipf/tfidf'
require 'zipf/Translation'
require 'zipf/dag'
require 'zipf/semirings'
require 'zipf/bleu'
require 'zipf/misc'
require 'zipf/hg'
require 'zipf/grammar'

STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
STDERR.set_encoding 'utf-8'
data/lib/zipf/SparseVector.rb
ADDED
@@ -0,0 +1,172 @@
class SparseVector < Hash

  def initialize arg=nil
    super
    self.default = 0
    if arg.is_a? Array
      from_a arg
    end
  end

  def from_a a
    a.each_with_index { |i,j| self[j] = i }
  end

  def self.from_a a
    v = SparseVector.new
    v.from_a a
    return v
  end

  def from_h h
    h.each_pair { |k,v| self[k] = v }
  end

  def self.from_h h
    v = SparseVector.new
    v.from_h h
    return v
  end

  def from_s s
    from_h eval(s)
  end

  def self.from_s s
    v = SparseVector.new
    v.from_s s
    return v
  end

  def to_kv sep='=', join=' '
    a = []
    self.each_pair { |k,v|
      a << "#{k}#{sep}#{v}"
    }
    return a.join join
  end

  def from_kv s
    s.split.each { |i|
      k,v = i.split('=')
      self[k] = v.to_f
    }
  end

  def self.from_kv s
    v = SparseVector.new
    v.from_kv s
    return v
  end

  def from_file fn, sep='='
    f = ReadFile.new(fn)
    while line = f.gets
      key, value = line.strip.split sep
      value = value.to_f
      self[key] = value
    end
  end

  def self.from_file fn, sep='='
    v = SparseVector.new
    v.from_file fn, sep
    return v
  end

  def join_keys other
    self.keys + other.keys
  end

  def sum
    self.values.inject(:+)
  end

  def approx_eql? other, p=10**-10
    return false if !other
    return false if other.size!=self.size
    return false if other.keys.sort!=self.keys.sort
    self.keys.each { |k|
      return false if (self[k]-other[k]).abs>p
    }
    return true
  end

  def average
    self.sum/self.size.to_f
  end

  def variance
    avg = self.average
    var = 0.0
    self.values.each { |i| var += (avg - i)**2 }
    return var
  end

  def stddev
    Math.sqrt self.variance
  end

  def dot other
    sum = 0.0
    self.each_pair { |k,v| sum += v * other[k] }
    return sum
  end

  def zeros n
    (0).upto(n-1) { |i| self[i] = 0.0 }
  end

  def magnitude
    Math.sqrt self.values.inject { |sum,i| sum+i**2 }
  end

  def cosinus_sim other
    self.dot(other)/(self.magnitude*other.magnitude)
  end

  def euclidian_dist other
    dims = [self.keys, other.keys].flatten.uniq
    sum = 0.0
    dims.each { |d| sum += (self[d] - other[d])**2 }
    return Math.sqrt(sum)
  end

  def + other
    new = SparseVector.new
    join_keys(other).each { |k|
      new[k] = self[k]+other[k]
    }
    return new
  end

  def - other
    new = SparseVector.new
    join_keys(other).each { |k|
      new[k] = self[k]-other[k]
    }
    return new
  end

  def * scalar
    raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric
    new = SparseVector.new
    self.keys.each { |k|
      new[k] = self[k] * scalar
    }
    return new
  end

  def self.mean a
    mean = SparseVector.new
    a.each { |i|
      i.each_pair { |k,v|
        mean[k] += v
      }
    }
    n = a.size.to_f
    mean.each_pair { |k,v| mean[k] = v/n }
    return mean
  end
end
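For reference, a minimal usage sketch of the SparseVector API above (not part of the packaged file), assuming the gem is installed and loaded via require 'zipf'; the example vectors are made up:

require 'zipf'

v = SparseVector.from_kv 'a=1 b=2'
w = SparseVector.from_kv 'b=4 c=0.5'
v.dot(w)          # => 8.0
(v + w).to_kv     # => "a=1.0 b=6.0 c=0.5"
v.cosinus_sim(w)  # cosine similarity of the two vectors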
data/lib/zipf/Translation.rb
ADDED
@@ -0,0 +1,72 @@
class Translation
  attr_accessor :id, :s, :raw, :f, :scores, :rank

  def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil
    @id = id
    @raw = raw
    @s = s
    @f = f
    @scores = scores
    @rank = rank
  end

  def from_s t, strip_alignment=true, rank=nil
    id, raw, features, score = splitpipe(t, 3)
    raw.strip!
    @raw = raw
    if strip_alignment # the way moses does it
      @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ')
      @s.strip!
    else
      @s = raw
    end
    @id = id.to_i
    @f = SparseVector.from_kv features
    @scores[:decoder] = score.to_f
    @rank = rank
  end

  def self.from_s s
    t = self.new
    t.from_s s
    return t
  end

  def to_s include_features=true
    [@id, @s, @f.to_kv('=', ' '), @scores[:decoder]].join(' ||| ') if include_features
    [@id, @s, @scores[:decoder]].join(' ||| ') if !include_features
  end

  def to_s2
    [@rank, @s, @score, @scores.to_s].join ' ||| '
  end
end

def read_kbest_lists fn, translation_type=Translation
  kbest_lists = []
  cur = []
  f = ReadFile.new fn
  prev = -1
  c = 0
  id = 0
  while line = f.gets
    t = translation_type.new
    t.from_s line
    c = splitpipe(line)[0].to_i
    if c != prev
      if cur.size > 0
        kbest_lists << cur
        cur = []
      end
      prev = c
      id = 0
    end
    t.id = id
    cur << t
    id += 1
  end
  kbest_lists << cur # last one
  f.close
  return kbest_lists
end
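For reference, a minimal sketch of parsing one k-best line with the Translation class above; the line follows the 'id ||| tokens ||| features ||| score' convention used by splitpipe, and the example string is made up:

require 'zipf'

t = Translation.from_s '0 ||| a small house ||| lm=-2.0 tm=0.5 ||| -4.2'
t.s                 # => "a small house"
t.f.to_kv           # => "lm=-2.0 tm=0.5"
t.scores[:decoder]  # => -4.2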
data/lib/zipf/bleu.rb
ADDED
@@ -0,0 +1,130 @@
module BLEU


class BLEU::NgramCounts
  attr_accessor :sum, :clipped, :ref_len, :hyp_len, :n

  def initialize(n)
    @n = 0
    @sum = []
    @clipped = []
    @ref_len = 0.0
    @hyp_len = 0.0
    grow(n)
  end

  def grow(n)
    (n-@n).times {
      @sum << 0.0
      @clipped << 0.0
    }
    @n = n
  end

  def plus_eq(other)
    if other.n > @n then grow(other.n) end
    0.upto(other.n-1) { |m|
      @sum[m] += other.sum[m]
      @clipped[m] += other.clipped[m]
    }
    @ref_len += other.ref_len
    @hyp_len += other.hyp_len
  end

  def to_s
    return "n=#{n} sum=#{sum} clipped=#{clipped} ref_len=#{ref_len} hyp_len=#{hyp_len}"
  end
end

class BLEU::Ngrams
  def initialize
    @h_ = {}
    @h_.default = 0
  end

  def add(k)
    if k.class == Array then k = k.join ' ' end
    @h_[k] += 1
  end

  def get_count(k)
    if k.class == Array then k = k.join ' ' end
    return @h_[k]
  end

  def each
    @h_.each_pair { |k,v|
      yield k.split, v
    }
  end

  def to_s
    @h_.to_s
  end
end

def BLEU::get_counts hypothesis, reference, n, times=1
  p = NgramCounts.new n
  r = Ngrams.new
  ngrams(reference, n) { |ng| r.add ng }
  h = Ngrams.new
  ngrams(hypothesis, n) { |ng| h.add ng }
  h.each { |ng,count|
    sz = ng.size-1
    p.sum[sz] += count * times
    p.clipped[sz] += [r.get_count(ng), count].min * times
  }
  p.ref_len = tokenize(reference.strip).size * times
  p.hyp_len = tokenize(hypothesis.strip).size * times
  return p
end

def BLEU::brevity_penalty c, r, smooth=0.0
  return [0.0, 1.0-((r+smooth)/c)].min
end

def BLEU::bleu counts, n, debug=false
  corpus_stats = NgramCounts.new n
  counts.each { |i| corpus_stats.plus_eq i }
  logbleu = 0.0
  0.upto(n-1) { |m|
    STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug
    return 0.0 if corpus_stats.clipped[m] == 0 or corpus_stats.sum == 0
    logbleu += Math.log(corpus_stats.clipped[m]) - Math.log(corpus_stats.sum[m])
  }
  logbleu /= n
  if debug
    STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)}\n"
    STDERR.write "sum #{Math.exp(sum)}\n"
  end
  logbleu += brevity_penalty corpus_stats.hyp_len, corpus_stats.ref_len
  return Math.exp logbleu
end

def BLEU::hbleu counts, n, debug=false
  (100*bleu(counts, n, debug)).round(3)
end

def BLEU::per_sentence_bleu hypothesis, reference, n=4, smooth=0.0
  h_ng = {}; r_ng = {}
  (1).upto(n) { |i| h_ng[i] = []; r_ng[i] = [] }
  ngrams(hypothesis, n) { |i| h_ng[i.size] << i }
  ngrams(reference, n) { |i| r_ng[i.size] << i }
  m = [n, reference.split.size].min
  add = 0.0
  logbleu = 0.0
  (1).upto(m) { |i|
    counts_clipped = 0
    counts_sum = h_ng[i].size
    h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) }
    add = 1.0 if i >= 2
    logbleu += Math.log(counts_clipped+add) - Math.log(counts_sum+add);
  }
  logbleu /= m
  logbleu += brevity_penalty hypothesis.strip.split.size, reference.strip.split.size, smooth
  return Math.exp logbleu
end


end #module
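For reference, a minimal sketch of the BLEU helpers above; the hypothesis and reference strings are made up:

require 'zipf'

hyp = 'the cat sat on the mat'
ref = 'the cat was sitting on the mat'
BLEU::per_sentence_bleu hyp, ref, 4       # smoothed sentence-level score
counts = [BLEU::get_counts(hyp, ref, 4)]  # per-segment ngram counts ...
BLEU::hbleu counts, 4                     # ... aggregated to a corpus-level score (in percent)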
data/lib/zipf/dag.rb
ADDED
@@ -0,0 +1,205 @@
module DAG

require 'json'


class DAG::Node
  attr_accessor :label, :outgoing, :incoming, :score, :mark

  def initialize label=nil, outgoing=[], incoming=[], score=nil
    @label = label
    @outgoing = outgoing
    @incoming = incoming
    @score = nil
  end

  def add_edge head, weight=0
    exit if self==head # no self-cycles!
    @outgoing << DAG::Edge.new(self, head, weight)
    return @outgoing.last
  end

  def to_s
    "DAG::Node<label:#{label}, outgoing:#{outgoing.size}, incoming:#{incoming.size}>"
  end

  def repr
    "#{to_s} #{@score} out:#{@outgoing} in:[#{@incoming.map{|e| e.to_s}.join ', '}]"
  end
end

class DAG::Edge
  attr_accessor :tail, :head, :weight, :mark

  def initialize tail=nil, head=nil, weight=0
    @tail = tail
    @head = head
    @weight = weight
    @mark = false # did we already follow this edge? -- for topological sorting
  end

  def to_s
    s = "DAG::Edge<#{@tail} ->[#{weight}] #{@head}"
    s += " x" if @mark
    s += ">"
    s
  end
end

# depth-first search
# w/o markings as we do not have cycles
def DAG::dfs n, target_label
  return n if n.label==target_label # assumes uniq labels!
  stack = n.outgoing.map { |i| i.head }
  while !stack.empty?
    m = stack.pop
    return DAG::dfs m, target_label
  end
  return nil
end

# breadth-first search
# w/o markings as we do not have cycles
def DAG::bfs n, target_label
  queue = [n]
  while !queue.empty?
    m = queue.shift
    return m if m.label==target_label
    m.outgoing.each { |e| queue << e.head }
  end
  return nil
end

# topological sort
def DAG::topological_sort graph
  sorted = []
  s = graph.reject { |n| !n.incoming.empty? }
  while !s.empty?
    sorted << s.shift
    sorted.last.outgoing.each { |e|
      e.mark = true
      s << e.head if e.head.incoming.reject{|f| f.mark}.empty?
    }
  end
  return sorted
end

# initialize graph scores with semiring One
def DAG::init graph, semiring, source_node
  graph.each {|n| n.score=semiring.null}
  source_node.score = semiring.one
end

# viterbi
def DAG::viterbi graph, semiring=ViterbiSemiring, source_node
  toposorted = DAG::topological_sort(graph)
  DAG::init(graph, semiring, source_node)
  toposorted.each { |n|
    n.incoming.each { |e|
      # update
      n.score = \
        semiring.add.call(n.score, \
          semiring.multiply.call(e.tail.score, e.weight)
        )
    }
  }
end

# forward viterbi
def DAG::viterbi_forward graph, semiring=ViterbiSemiring, source_node
  toposorted = DAG::topological_sort(graph)
  DAG::init(graph, semiring, source_node)
  toposorted.each { |n|
    n.outgoing.each { |e|
      e.head.score = \
        semiring.add.call(e.head.score, \
          semiring.multiply.call(n.score, e.weight)
        )
    }
  }
end

# Dijkstra algorithm
# for A*-search we would need an optimistic estimate of
# future cost at each node
def DAG::dijkstra graph, semiring=RealSemiring.new, source_node
  DAG::init(graph, semiring, source_node)
  q = PriorityQueue.new graph
  while !q.empty?
    n = q.pop
    n.outgoing.each { |e|
      e.head.score = \
        semiring.add.call(e.head.score, \
          semiring.multiply.call(n.score, e.weight))
      q.sort!
    }
  end
end

# Bellman-Ford algorithm
def DAG::bellman_ford(graph, semiring=RealSemiring.new, source_node)
  DAG::init(graph, semiring, source_node)
  edges = []
  graph.each { |n| edges |= n.outgoing }
  # relax edges
  (graph.size-1).times{ |i|
    edges.each { |e|
      e.head.score = \
        semiring.add.call(e.head.score, \
          semiring.multiply.call(e.tail.score, e.weight))
    }
  }
  # we do not allow cycles (negative or positive)
end

# Floyd algorithm
def DAG::floyd(graph, semiring=nil)
  dist_matrix = []
  graph.each_index { |i|
    dist_matrix << []
    graph.each_index { |j|
      val = 1.0/0.0
      val = 0.0 if i==j
      dist_matrix.last << val
    }
  }
  edges = []
  graph.each { |n| edges |= n.outgoing }
  edges.each { |e|
    dist_matrix[graph.index(e.tail)][graph.index(e.head)] = e.weight
  }
  0.upto(graph.size-1) { |k|
    0.upto(graph.size-1) { |i|
      0.upto(graph.size-1) { |j|
        if dist_matrix[i][k] + dist_matrix[k][j] < dist_matrix[i][j]
          dist_matrix[i][j] = dist_matrix[i][k] + dist_matrix[k][j]
        end
      }
    }
  }
  return dist_matrix
end


# returns a list of nodes (graph) and a hash for finding
# nodes by their label (these need to be unique!)
def DAG::read_graph_from_json fn, semiring=RealSemiring.new
  graph = []
  nodes_by_label = {}
  h = JSON.parse File.new(fn).read
  h['nodes'].each { |i|
    n = DAG::Node.new i['label']
    graph << n
    nodes_by_label[n.label] = n
  }
  h['edges'].each { |i|
    n = nodes_by_label[i['tail']]
    a = n.add_edge(nodes_by_label[i['head']], semiring.convert.call(i['weight'].to_f))
    nodes_by_label[i['head']].incoming << a
  }
  return graph, nodes_by_label
end


end #module
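For reference, a minimal sketch of building a small graph by hand and scoring it with DAG::viterbi_forward and the ViterbiSemiring from semirings.rb; node labels and weights are made up, and incoming edges are wired manually, as DAG::read_graph_from_json does:

require 'zipf'

a = DAG::Node.new 'a'
b = DAG::Node.new 'b'
c = DAG::Node.new 'c'
b.incoming << a.add_edge(b, 0.5)
c.incoming << b.add_edge(c, 0.25)

DAG::viterbi_forward([a, b, c], ViterbiSemiring.new, a)
c.score # => 0.125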
data/lib/zipf/fileutil.rb
ADDED
@@ -0,0 +1,88 @@
require 'zlib'


class ReadFile

  def initialize fn, encoding='utf-8'
    if fn.split('.').last == 'gz'
      @f = Zlib::GzipReader.new(File.new(fn, 'rb'), :external_encoding=>encoding)
    elsif fn == '-'
      @f = STDIN
      STDIN.set_encoding encoding
    else
      @f = File.new fn, 'r'
      @f.set_encoding encoding
    end
  end

  def gets
    @f.gets { |line| yield line }
  end

  def readlines
    @f.readlines
  end

  def self.readlines fn, encoding='utf-8'
    f = ReadFile.new fn, encoding
    r = f.readlines
    f.close
    return r
  end

  def readlines_strip
    self.readlines.map{ |i| i.strip }
  end

  def self.readlines_strip fn, encoding='utf-8'
    f = ReadFile.new fn, encoding
    r = f.readlines_strip
    f.close
    return r
  end

  def read
    @f.read
  end

  def self.read fn, encoding='utf-8'
    f = ReadFile.new fn, encoding
    r = f.read
    f.close
    return r
  end

  def close
    @f.close if @f!=STDIN
  end
end

class WriteFile

  def initialize fn, encoding='utf-8'
    if fn.split('.').last == 'gz'
      @f = Zlib::GzipWriter.new(File.new(fn, 'wb+'), :external_encoding=>encoding)
    elsif fn == '-'
      @f = STDOUT
      STDOUT.set_encoding encoding
    else
      @f = File.new fn, 'w+'
      @f.set_encoding encoding
    end
  end

  def write s
    @f.write s
  end

  def self.write s, fn, encoding='utf-8'
    f = WriteFile.new fn, encoding
    f.write s
    f.close
  end

  def close
    @f.close if @f!=STDIN
  end
end
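For reference, a minimal sketch of the ReadFile/WriteFile wrappers above; 'example.txt' is a made-up file name (a '.gz' suffix would transparently select the Zlib reader/writer, '-' selects STDIN/STDOUT):

require 'zipf'

WriteFile.write "hello\nworld\n", 'example.txt'
ReadFile.readlines_strip 'example.txt'   # => ["hello", "world"]

f = ReadFile.new 'example.txt'
while line = f.gets
  # process line
end
f.close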
data/lib/zipf/grammar.rb
ADDED
@@ -0,0 +1,123 @@
module Grammar


class T
  attr_accessor :word

  def initialize word
    @word = word
  end

  def to_s
    "T<#{@word}>"
  end
end

class NT
  attr_accessor :symbol, :index, :span

  def initialize symbol, index=0
    @symbol = symbol
    @index = index
    @span = Span.new
  end

  def to_s
    "NT(#{@span.left},#{@span.right})<#{@symbol},#{@index}>"
  end
end

class Rule
  attr_accessor :lhs, :rhs, :e

  def initialize lhs=nil, rhs=[], e=''
    @lhs = lhs
    @rhs = rhs
    @e = e
  end

  def to_s
    "#{lhs} -> #{rhs.map{ |i| i.to_s }.join ' '} [arity=#{arity}] ||| #{@e}"
  end

  def arity
    rhs.select { |i| i.class == NT }.size
  end

  def from_s s
    _ = splitpipe s, 3
    @lhs = NT.new _[0].strip.gsub!(/(\[|\])/, "")
    _[1].split.each { |x|
      x.strip!
      if x[0]=='[' && x[x.size-1] == ']'
        @rhs << NT.new(x.gsub!(/(\[|\])/, "").split(',')[0])
      else
        @rhs << T.new(x)
      end
    }
    @e = _[2]
  end

  def self.from_s s
    r = self.new
    r.from_s s
    return r
  end
end

class Span
  attr_accessor :left, :right

  def initialize left=nil, right=nil
    @left = left
    @right = right
  end
end

class Grammar
  attr_accessor :rules, :startn, :startt, :flat

  def initialize fn
    @rules = []; @startn = []; @startt = []; @flat = []
    ReadFile.readlines_strip(fn).each_with_index { |s,i|
      STDERR.write '.'; STDERR.write " #{i+1}\n" if (i+1)%80==0
      @rules << Rule.from_s(s)
      if @rules.last.rhs.first.class == NT
        @startn << @rules.last
      else
        if rules.last.arity == 0
          @flat << @rules.last
        else
          @startt << @rules.last
        end
      end
    }
    STDERR.write "\n"
  end

  def to_s
    s = ''
    @rules.each { |r| s += r.to_s+"\n" }
    return s
  end

  def add_glue_rules
    @rules.map { |r| r.lhs.symbol }.select { |s| s != 'S' }.uniq.each { |symbol|
      @rules << Rule.new(NT.new('S'), [NT.new(symbol)])
      @startn << @rules.last
      @rules << Rule.new(NT.new('S'), [NT.new('S'), NT.new('X')])
      @startn << @rules.last
    }
  end

  def add_pass_through_rules s
    s.each { |word|
      @rules << Rule.new(NT.new('X'), [T.new(word)])
      @flat << @rules.last
    }
  end
end


end #module
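For reference, a minimal sketch of parsing a single rule string with Grammar::Rule.from_s above; the rule text is made up, bracketed tokens become non-terminals and everything else becomes terminals:

require 'zipf'

r = Grammar::Rule.from_s '[X] ||| ein [X,1] haus ||| a [X,1] house'
r.arity                    # => 1
r.lhs.symbol               # => "X"
r.rhs.map { |i| i.class }  # => [Grammar::T, Grammar::NT, Grammar::T]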
data/lib/zipf/hg.rb
ADDED
@@ -0,0 +1,173 @@
#!/usr/bin/env ruby

require_relative 'semirings'
require 'json'


module HG


class HG::Node
  attr_accessor :label, :cat, :outgoing, :incoming, :score

  def initialize label=nil, cat=nil, outgoing=[], incoming=[], score=nil
    @label = label
    @cat = cat
    @outgoing = outgoing
    @incoming = incoming
    @score = nil
  end

  def to_s
    "Node<label:\"#{@label}\", cat:\"#{@cat}\", outgoing:#{@outgoing.size}, incoming:#{@incoming.size}>"
  end
end

class HG::Hypergraph
  attr_accessor :nodes, :edges

  def initialize nodes=[], edges=[]
    @nodes = nodes
    @edges = edges
  end

  def arity
    @edges.map { |e| e.arity }.max
  end

  def to_s
    "Hypergraph<nodes:[#{@nodes.to_s}], edges:[#{@edges.to_s}], arity:#{arity}>"
  end
end

class HG::Hyperedge
  attr_accessor :head, :tails, :weight, :f, :mark, :rule, :left, :right

  def initialize head=nil, tails=[], weight=0.0, f={}
    @head = head
    @tails = tails
    @weight = weight
    @f = f
    @mark = 0
  end

  def arity
    return @tails.size
  end

  def marked?
    arity == @mark
  end

  def to_s
    "Hyperedge<head:\"#{@head.label}\", \"tails:#{@tails.map{|n|n.label}}, arity:#{arity}, weight:#{@weight}, f:#{f.to_s}, mark:#{@mark}>"
  end
end

def HG::topological_sort nodes
  sorted = []
  s = nodes.reject { |n| !n.incoming.empty? }
  while !s.empty?
    sorted << s.shift
    sorted.last.outgoing.each { |e|
      next if e.marked?
      e.mark += 1
      s << e.head if e.head.incoming.reject{ |f| f.mark==f.arity }.empty?
    }
  end
  return sorted
end

def HG::init nodes, semiring, root
  nodes.each { |n| n.score=semiring.null }
  root.score = semiring.one
end

def HG::viterbi hypergraph, root, semiring=ViterbiSemiring.new
  toposorted = topological_sort hypergraph.nodes
  init toposorted, semiring, root
  toposorted.each { |n|
    n.incoming.each { |e|
      s = semiring.one
      e.tails.each { |m|
        s = semiring.multiply.call(s, m.score)
      }
      n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight))
    }
  }
end

def HG::viterbi_path hypergraph, root, semiring=ViterbiSemiring.new
  toposorted = topological_sort hypergraph.nodes
  init toposorted, semiring, root
  best_path = []
  toposorted.each { |n|
    best_edge = nil
    n.incoming.each { |e|
      s = semiring.one
      e.tails.each { |m|
        s = semiring.multiply.call(s, m.score)
      }
      if n.score < semiring.multiply.call(s, e.weight) # ViterbiSemiring add
        best_edge = e
      end
      n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight))
    }
    best_path << best_edge
  }
  return best_path, toposorted.last.score
end

def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=false
  nodes = []
  edges = []
  nodes_by_label = {}
  nodes_by_index = []
  h = JSON.parse File.new(fn).read
  w = SparseVector.from_h h['weights']
  h['nodes'].each { |i|
    n = Node.new i['label'], i['cat']
    nodes << n
    nodes_by_label[n.label] = n
    nodes_by_index << n
  }
  h['edges'].each { |i|
    e = Hyperedge.new(nodes_by_label[i['head']], \
                      i['tails'].map{|j| nodes_by_label[j]}.to_a, \
                      semiring.convert.call(i['weight'].to_f), \
                      {})
    e.f = SparseVector.from_h i['f']
    if log_weights
      e.weight = Math.exp(w.dot(e.f))
    else
      e.weight = w.dot(e.f)
    end
    e.tails.each { |m|
      m.outgoing << e
    }
    e.head.incoming << e
    edges << e
  }
  return Hypergraph.new(nodes, edges), nodes_by_label, nodes_by_index
end

def HG::all_paths hypergraph, root, semiring=ViterbiSemiring.new
  toposorted = topological_sort hypergraph.nodes
  paths = [[]]
  toposorted.each { |n|
    next if n.incoming.empty?
    new_paths = []
    while !paths.empty?
      p = paths.pop
      n.incoming.each { |e|
        new_paths << p+[e]
      }
    end
    paths = new_paths
  }
  return paths
end


end #module
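For reference, a minimal sketch of the JSON layout that HG::read_hypergraph_from_json above reads (keys 'weights', 'nodes', 'edges' as accessed in the code), followed by a Viterbi pass; the file name, labels, and feature values are all made up:

require 'zipf'
require 'json'

h = { 'weights' => { 'lm' => 1.0 },
      'nodes'   => [ { 'label' => 'root', 'cat' => 'S' },
                     { 'label' => 'goal', 'cat' => 'S' } ],
      'edges'   => [ { 'head' => 'goal', 'tails' => ['root'],
                       'weight' => 1.0, 'f' => { 'lm' => 0.5 } } ] }
WriteFile.write h.to_json, 'hg.json'

hypergraph, nodes_by_label, _ = HG::read_hypergraph_from_json 'hg.json'
HG::viterbi hypergraph, nodes_by_label['root']
nodes_by_label['goal'].score # => 0.5 (edge weight = weights.dot(f))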
data/lib/zipf/misc.rb
ADDED
@@ -0,0 +1,114 @@
require 'timeout'


class Array
  def max_index
    self.index(self.max)
  end

  def is_subset_of? other
    self.each { |i|
      if other.include? i
        return false
      end
    }
    return true
  end

  def sum
    self.inject(:+)
  end

  def mean
    self.sum.to_f/self.size
  end
end

class String

  def downcase?
    self[/[[:lower:]]/]
  end
end

class PriorityQueue
  # This assumes that elements in the queue
  # have a numerical member named 'score'.

  def initialize a=Array.new
    @queue = Array.new a
    sort!
  end

  def sort!
    @queue.sort_by! { |i| -i.score }
  end

  def pop
    @queue.pop
  end

  def push i
    @queue << i
    sort!
  end

  def empty?
    @queue.empty?
  end
end

def spawn_with_timeout cmd, t=4, ignore_fail=false, debug=false
  STDERR.write cmd+"\n" if debug
  pipe_in, pipe_out = IO.pipe
  pid = Process.spawn(cmd, :out => pipe_out)
  begin
    Timeout.timeout(t) { Process.wait pid }
  rescue Timeout::Error
    Process.kill('TERM', pid) if !ignore_fail
  end
  pipe_out.close
  return pipe_in.read
end

def read_phrase_table fn
  table = {}
  f = ReadFile.new fn
  while raw_rule = f.gets
    french, english, features = splitpipe(raw_rule)
    feature_map = SparseVector.from_kv features
    if table.has_key? french
      table[french] << [english, feature_map ]
    else
      table[french] = [[english, feature_map]]
    end
  end
  f.close
  return table
end

def cdec_kbest cdec_bin, input, ini, weights, k, unique=true
  require 'open3'
  cmd = "echo \"#{input}\" | #{cdec_bin} -c #{ini} -w #{weights} -k #{k}"
  cmd += " -r" if unique
  o,_ = Open3.capture2 "#{cmd} 2>/dev/null"
  a = []; j = -1
  o.split("\n").map{ |i| j+=1; t=Translation.new; t.from_s(i, false, j); a << t }
  return a
end

def read_config fn
  f = ReadFile.new fn
  cfg = {}
  while line = f.gets
    line.strip!
    next if /^\s*$/.match line
    next if line[0]=='#'
    content = line.split('#', 2).first
    k, v = content.split(/\s*=\s*/, 2)
    k.strip!; v.strip!
    cfg[k] = v
  end
  return cfg
end
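For reference, a minimal sketch of the PriorityQueue above, which only requires queued objects to respond to #score (the Struct is made up); #pop returns the element with the lowest score, as relied on by DAG::dijkstra:

require 'zipf'

Item = Struct.new(:label, :score)
q = PriorityQueue.new [Item.new('a', 0.7), Item.new('b', 0.1)]
q.push Item.new('c', 0.4)
q.pop.label # => "b"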
data/lib/zipf/semirings.rb
ADDED
@@ -0,0 +1,81 @@
# Semirings for directed acyclic graphs (dags) (also directed hypergraphs),
# as described in:
# 'Dynamic Programming Algorithms in
#  Semiring and Hypergraph Frameworks' (Liang Huang)
#

class Semiring
  attr_accessor :add, :multiply, :one, :null, :convert
end

class BooleanSemiring < Semiring
  def initialize
    @add = Proc.new { |a,b| a||b }
    @multiply = Proc.new { |a,b| a&&b }
    @one = true
    @null = false
    @convert = Proc.new { |v| true && v!=0 }
  end
end

class ViterbiSemiring < Semiring
  def initialize
    @add = Proc.new { |a,b| [a,b].max }
    @multiply = Proc.new { |a,b| a*b }
    @one = 1.0
    @null = 0.0
    @convert = Proc.new { |v| v }
  end
end

class ViterbiLogSemiring < Semiring
  def initialize
    @add = Proc.new { |a,b| [a,b].max }
    @multiply = Proc.new { |a,b| a+b }
    @one = 0.0
    @null = -1.0/0.0
    @convert = Proc.new { |v| v }
  end
end

class InsideSemiring < Semiring
  def initialize
    @add = Proc.new { |a,b| a+b }
    @multiply = Proc.new { |a,b| a*b }
    @one = 1.0
    @null = 0.0
    @convert = Proc.new { |v| v }
  end
end

class RealSemiring < Semiring
  def initialize
    @add = Proc.new { |a,b| [a,b].min }
    @multiply = Proc.new { |a,b| a+b }
    @one = 0.0
    @null = 1.0/0.0
    @convert = Proc.new { |v| v }
  end
end

# for longest/worst paths
class RealxSemiring < Semiring
  def initialize
    @add = Proc.new { |a,b| [a,b].max }
    @multiply = Proc.new { |a,b| a+b }
    @one = -1.0/0.0
    @null = 0.0
    @convert = Proc.new { |v| v }
  end
end

class CountingSemiring < Semiring
  def initialize
    @add = Proc.new { |a,b| a+b }
    @multiply = Proc.new { |a,b| a*b }
    @one = 1.0
    @null = 0.0
    @convert = Proc.new { |v| if v!=0 then 1 else 0 end }
  end
end
data/lib/zipf/stringutil.rb
ADDED
@@ -0,0 +1,22 @@
def tokenize s
  s.strip.split
end

def ngrams(s, n, fix=false)
  a = tokenize s
  a.each_with_index { |tok, i|
    tok.strip!
    0.upto([n-1, a.size-i-1].min) { |m|
      yield a[i..i+m] if !fix||(fix&&a[i..i+m].size==n)
    }
  }
end

def bag_of_words s, stopwords=[]
  s.strip.split.uniq.sort.reject{ |w| stopwords.include? w }
end

def splitpipe s, n=3
  s.strip.split("|"*n)
end
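For reference, a minimal sketch of the string helpers above; the input strings are made up:

require 'zipf'

ngrams('a b c', 2, true) { |ng| p ng }   # yields ["a", "b"], then ["b", "c"]
splitpipe '0 ||| a house ||| f=1'        # => ["0 ", " a house ", " f=1"]
bag_of_words 'the cat the mat', ['the']  # => ["cat", "mat"]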
data/lib/zipf/tfidf.rb
ADDED
@@ -0,0 +1,38 @@
module TFIDF


# returns key='raw frequency' for an
# array-like object
def TFIDF::tf array, stopwords=[]
  v = {}; v.default = 0
  array.uniq.each { |i|
    next if stopwords.include? i
    v[i] = array.count(i).to_f
  }
  return v
end

# smoothes raw frequencies of tf() in-place
# a is a smoothing term
def TFIDF::ntf hash, a=0.4
  max = hash.values.max.to_f
  hash.each_pair { |k,v|
    hash[k] = a + (1-a)*(v/max)
  }
end

# returns idf value for each word in a vocabulary
def TFIDF::idf list_of_hashes
  vocab = list_of_hashes.values.flatten.uniq
  n = list_of_hashes.size.to_f
  idf = {}
  vocab.each { |i|
    df = list_of_hashes.values.flatten.count i
    idf[i] = Math.log(n/df)
  }
  return idf
end


end #module
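For reference, a minimal sketch of the TFIDF helpers above, assuming documents are given as a Hash mapping a document id to an array of tokens (all data made up):

require 'zipf'

docs = { 'd1' => %w[a b b c], 'd2' => %w[b c c] }
tf = TFIDF::tf docs['d1']  # => {"a"=>1.0, "b"=>2.0, "c"=>1.0}
TFIDF::ntf tf              # smooths the raw counts in place
idf = TFIDF::idf docs      # term => Math.log(n/count) as computed above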
metadata
ADDED
@@ -0,0 +1,55 @@
--- !ruby/object:Gem::Specification
name: zipf
version: !ruby/object:Gem::Version
  version: 1.0.1
platform: ruby
authors:
- Patrick Simianer
autorequire:
bindir: bin
cert_chain: []
date: 2014-06-16 00:00:00.000000000 Z
dependencies: []
description: NLP related tools and classes
email: p@simianer.de
executables: []
extensions: []
extra_rdoc_files: []
files:
- lib/zipf.rb
- lib/zipf/stringutil.rb
- lib/zipf/misc.rb
- lib/zipf/grammar.rb
- lib/zipf/hg.rb
- lib/zipf/fileutil.rb
- lib/zipf/semirings.rb
- lib/zipf/dag.rb
- lib/zipf/SparseVector.rb
- lib/zipf/tfidf.rb
- lib/zipf/bleu.rb
- lib/zipf/Translation.rb
homepage: http://simianer.de
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - '>='
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - '>='
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.0.3
signing_key:
specification_version: 4
summary: zipf
test_files: []