zipf 1.2.1 → 1.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/zipf/bleu.rb +55 -20
- data/lib/zipf/misc.rb +8 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5dc46e50a6d5f63fb3b1db68dce9cb35a8342172
|
4
|
+
data.tar.gz: d849ab3031db7b9b65f77e16d746792dcdaf4827
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 12d97d1af9aa19d63ed02b2c17c0b3506fecf9ed0b0c9cbea4f4e233ff87e4734c3a8a4c0f1754a9d09cdb48072d249e1479d89fae5b5175eb242d43e3b24aa7
|
7
|
+
data.tar.gz: 9e179f4165835a25984547f1f6a98166dd36747ec966f7adbc8bc5cbe561627040fc6ab7b03ff61f551b3fac0293d6c49ac117ef9d37b969cfd33fdb271482ea
|
data/lib/zipf/bleu.rb
CHANGED
@@ -62,19 +62,34 @@ class BLEU::Ngrams
|
|
62
62
|
end
|
63
63
|
end
|
64
64
|
|
65
|
-
def BLEU::
|
65
|
+
def BLEU::best_match_length hypothesis, references
|
66
|
+
hyp_len = hypothesis.strip.split.size
|
67
|
+
ref_lens = references.map { |r| r.strip.split.size }
|
68
|
+
min = Integer::MAX
|
69
|
+
min_idx = -1
|
70
|
+
ref_lens.each_with_index { |l,i|
|
71
|
+
min_idx = i if (hyp_len-l).abs < min
|
72
|
+
}
|
73
|
+
return hyp_len, ref_lens[min_idx]
|
74
|
+
end
|
75
|
+
|
76
|
+
def BLEU::get_counts hypothesis, references, n, times=1
|
66
77
|
p = NgramCounts.new n
|
67
|
-
r =
|
68
|
-
|
78
|
+
r = []
|
79
|
+
references.each { |reference|
|
80
|
+
r << Ngrams.new
|
81
|
+
ngrams(reference, n) { |ng| r.last.add ng }
|
82
|
+
}
|
69
83
|
h = Ngrams.new
|
70
84
|
ngrams(hypothesis, n) { |ng| h.add ng }
|
71
85
|
h.each { |ng,count|
|
72
86
|
sz = ng.size-1
|
73
87
|
p.sum[sz] += count * times
|
74
|
-
p.clipped[sz] += [r.get_count(ng), count].min * times
|
88
|
+
p.clipped[sz] += [r.map { |i| i.get_count(ng)}.max, count].min * times
|
75
89
|
}
|
76
|
-
p.ref_len =
|
77
|
-
p.hyp_len
|
90
|
+
p.hyp_len, p.ref_len = best_match_length hypothesis, references
|
91
|
+
p.hyp_len *= times
|
92
|
+
p.ref_len *= times
|
78
93
|
return p
|
79
94
|
end
|
80
95
|
|
@@ -82,45 +97,65 @@ def BLEU::brevity_penalty c, r, smooth=0.0
|
|
82
97
|
return [0.0, 1.0-((r+smooth)/c)].min
|
83
98
|
end
|
84
99
|
|
85
|
-
def BLEU::bleu counts, n, debug=false
|
100
|
+
def BLEU::bleu_ counts, n, debug=false
|
86
101
|
corpus_stats = NgramCounts.new n
|
87
102
|
counts.each { |i| corpus_stats.plus_eq i }
|
88
103
|
logbleu = 0.0
|
89
104
|
0.upto(n-1) { |m|
|
90
|
-
STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug
|
105
|
+
STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]} = #{(corpus_stats.clipped[m]/corpus_stats.sum[m]).round 2}\n" if debug
|
91
106
|
return 0.0 if corpus_stats.clipped[m] == 0 or corpus_stats.sum == 0
|
92
107
|
logbleu += Math.log(corpus_stats.clipped[m]) - Math.log(corpus_stats.sum[m])
|
93
108
|
}
|
94
109
|
logbleu /= n
|
95
|
-
if debug
|
96
|
-
STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)}\n"
|
97
|
-
STDERR.write "sum #{Math.exp(sum)}\n"
|
98
|
-
end
|
110
|
+
STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len).round 2}\n" if debug
|
99
111
|
logbleu += brevity_penalty corpus_stats.hyp_len, corpus_stats.ref_len
|
100
112
|
return Math.exp logbleu
|
101
113
|
end
|
102
114
|
|
103
|
-
def BLEU::hbleu counts, n, debug=false
|
115
|
+
def BLEU::bleu hyp_file, ref_file, n, debug=false
|
116
|
+
hypotheses = ReadFile.readlines_strip(hyp_file)
|
117
|
+
references = ReadFile.readlines_strip(ref_file).map { |l|
|
118
|
+
splitpipe(l,3)
|
119
|
+
}
|
120
|
+
counts = []
|
121
|
+
hypotheses.each_with_index { |h,i|
|
122
|
+
counts << BLEU::get_counts(h, references[i], 4)
|
123
|
+
}
|
124
|
+
bleu_ counts, n, debug
|
125
|
+
end
|
126
|
+
|
127
|
+
def BLEU::hbleu_ counts, n, debug=false
|
104
128
|
(100*bleu(counts, n, debug)).round(3)
|
105
129
|
end
|
106
130
|
|
107
|
-
def BLEU::
|
108
|
-
|
109
|
-
|
131
|
+
def BLEU::hbleu hypotheses, references, n, debug=false
|
132
|
+
end
|
133
|
+
|
134
|
+
def BLEU::per_sentence_bleu hypothesis, references, n=4, smooth=0.0
|
135
|
+
h_ng = {}; r_ng = []
|
136
|
+
num_ref = references.size
|
137
|
+
num_ref.times { r_ng << {} }
|
138
|
+
(1).upto(n) { |i| h_ng[i] = []; num_ref.times { |j| r_ng[j][i] = [] } }
|
110
139
|
ngrams(hypothesis, n) { |i| h_ng[i.size] << i }
|
111
|
-
|
112
|
-
|
140
|
+
references.each_with_index { |reference,j|
|
141
|
+
ngrams(reference, n) { |i| r_ng[j][i.size] << i }
|
142
|
+
}
|
143
|
+
m = [n, references.map { |i| i.split.size }.max].min
|
113
144
|
add = 0.0
|
114
145
|
logbleu = 0.0
|
115
146
|
(1).upto(m) { |i|
|
116
147
|
counts_clipped = 0
|
117
148
|
counts_sum = h_ng[i].size
|
118
|
-
h_ng[i].uniq.each { |j|
|
149
|
+
h_ng[i].uniq.each { |j|
|
150
|
+
max_count = [h_ng[i].count(j), r_ng.map { |r| r[i].count(j) }.max].min
|
151
|
+
counts_clipped += max_count
|
152
|
+
}
|
119
153
|
add = 1.0 if i >= 2
|
120
154
|
logbleu += Math.log(counts_clipped+add) - Math.log(counts_sum+add);
|
121
155
|
}
|
122
156
|
logbleu /= m
|
123
|
-
|
157
|
+
hyp_len, best_ref_len = BLEU::best_match_length hypothesis, references
|
158
|
+
logbleu += brevity_penalty hyp_len, best_ref_len, smooth
|
124
159
|
return Math.exp logbleu
|
125
160
|
end
|
126
161
|
|
data/lib/zipf/misc.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zipf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.1
|
4
|
+
version: 1.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Patrick Simianer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-01-
|
11
|
+
date: 2015-01-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json
|