zipf 1.2.1 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/zipf/bleu.rb +55 -20
- data/lib/zipf/misc.rb +8 -0
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 5dc46e50a6d5f63fb3b1db68dce9cb35a8342172
|
|
4
|
+
data.tar.gz: d849ab3031db7b9b65f77e16d746792dcdaf4827
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 12d97d1af9aa19d63ed02b2c17c0b3506fecf9ed0b0c9cbea4f4e233ff87e4734c3a8a4c0f1754a9d09cdb48072d249e1479d89fae5b5175eb242d43e3b24aa7
|
|
7
|
+
data.tar.gz: 9e179f4165835a25984547f1f6a98166dd36747ec966f7adbc8bc5cbe561627040fc6ab7b03ff61f551b3fac0293d6c49ac117ef9d37b969cfd33fdb271482ea
|
data/lib/zipf/bleu.rb
CHANGED
|
@@ -62,19 +62,34 @@ class BLEU::Ngrams
|
|
|
62
62
|
end
|
|
63
63
|
end
|
|
64
64
|
|
|
65
|
-
def BLEU::
|
|
65
|
+
# Returns the hypothesis length together with the length of the reference
# that is closest to it.
#
# Lengths are token counts obtained by stripping the string and splitting
# it on whitespace.
#
# hypothesis - String
# references - Array of Strings
#
# Returns a two-element Array [hyp_len, ref_len]. When several references
# are equally close the first one wins; ref_len is nil if references is
# empty.
def BLEU::best_match_length hypothesis, references
  hyp_len = hypothesis.strip.split.size
  ref_lens = references.map { |r| r.strip.split.size }
  # min_by replaces the former manual search, which relied on the
  # non-existent constant Integer::MAX and never updated its running
  # minimum.
  best_ref_len = ref_lens.min_by { |l| (hyp_len-l).abs }
  return hyp_len, best_ref_len
end
|
|
75
|
+
|
|
76
|
+
# Collects n-gram statistics of one hypothesis against several references.
#
# hypothesis - String
# references - Array of Strings
# n          - maximum n-gram order
# times      - factor applied to every count and to both lengths
#
# Returns an NgramCounts whose sum/clipped entries are indexed by
# (n-gram size - 1). A hypothesis n-gram is clipped to the highest count it
# has in any single reference.
def BLEU::get_counts hypothesis, references, n, times=1
  stats = NgramCounts.new n
  # One n-gram table per reference.
  ref_tables = references.map { |reference|
    table = Ngrams.new
    ngrams(reference, n) { |ng| table.add ng }
    table
  }
  hyp_table = Ngrams.new
  ngrams(hypothesis, n) { |ng| hyp_table.add ng }
  hyp_table.each { |ng,count|
    order = ng.size-1
    stats.sum[order] += count * times
    best_ref_count = ref_tables.map { |t| t.get_count(ng) }.max
    stats.clipped[order] += [best_ref_count, count].min * times
  }
  stats.hyp_len, stats.ref_len = best_match_length hypothesis, references
  stats.hyp_len *= times
  stats.ref_len *= times
  return stats
end
|
|
80
95
|
|
|
@@ -82,45 +97,65 @@ def BLEU::brevity_penalty c, r, smooth=0.0
|
|
|
82
97
|
return [0.0, 1.0-((r+smooth)/c)].min
|
|
83
98
|
end
|
|
84
99
|
|
|
85
|
-
def BLEU::
|
|
100
|
+
# Computes a corpus-level score from per-segment n-gram statistics.
#
# counts - collection of per-segment statistics; each element is folded
#          into a fresh NgramCounts via plus_eq
# n      - maximum n-gram order
# debug  - when true, per-order precisions and the brevity penalty are
#          written to STDERR
#
# Returns exp(mean log precision + brevity penalty), or 0.0 as soon as one
# order has no clipped matches or no hypothesis n-grams at all.
def BLEU::bleu_ counts, n, debug=false
  corpus_stats = NgramCounts.new n
  counts.each { |i| corpus_stats.plus_eq i }
  logbleu = 0.0
  0.upto(n-1) { |m|
    clipped = corpus_stats.clipped[m]
    total = corpus_stats.sum[m]
    if debug
      # to_f avoids truncating integer division; the zero check keeps the
      # debug output from failing before the guard below can return.
      ratio = total == 0 ? 0.0 : (clipped.to_f/total).round(2)
      STDERR.write "#{m+1} #{clipped} / #{total} = #{ratio}\n"
    end
    # The total has to be tested per order: comparing the whole sum
    # container to 0 is never true.
    return 0.0 if clipped == 0 or total == 0
    logbleu += Math.log(clipped) - Math.log(total)
  }
  logbleu /= n
  bp = brevity_penalty corpus_stats.hyp_len, corpus_stats.ref_len
  STDERR.write "BP #{bp.round 2}\n" if debug
  # brevity_penalty is added in log space, before exponentiating.
  logbleu += bp
  return Math.exp logbleu
end
|
|
102
114
|
|
|
103
|
-
def BLEU::
|
|
115
|
+
# Scores a hypothesis file against a reference file.
#
# hyp_file - passed to ReadFile.readlines_strip; one hypothesis per line
# ref_file - passed to ReadFile.readlines_strip; every line is handed to
#            splitpipe(l,3) to obtain the references of that segment
#            (presumably a separator-delimited list - confirm against
#            splitpipe's definition)
# n        - maximum n-gram order
# debug    - forwarded to bleu_
#
# Returns the value of bleu_ for the collected statistics.
def BLEU::bleu hyp_file, ref_file, n, debug=false
  hypotheses = ReadFile.readlines_strip(hyp_file)
  references = ReadFile.readlines_strip(ref_file).map { |l| splitpipe(l,3) }
  counts = []
  hypotheses.each_with_index { |h,i|
    # Collect statistics up to the requested order instead of a fixed 4, so
    # that they match what bleu_ aggregates.
    counts << BLEU::get_counts(h, references[i], n)
  }
  bleu_ counts, n, debug
end
|
|
126
|
+
|
|
127
|
+
# Same score as bleu_, scaled to 0..100 and rounded to three decimals.
#
# counts - per-segment statistics, as accepted by bleu_
# n      - maximum n-gram order
# debug  - forwarded to bleu_
def BLEU::hbleu_ counts, n, debug=false
  # Must call the counts-based bleu_; bleu expects two files.
  (100*bleu_(counts, n, debug)).round(3)
end
|
|
106
130
|
|
|
107
|
-
def BLEU::
|
|
108
|
-
|
|
109
|
-
|
|
131
|
+
# Same score as bleu, scaled to 0..100 and rounded to three decimals.
#
# NOTE(review): this method was an empty stub that always returned nil; it
# is implemented here by analogy with hbleu_. The arguments are forwarded
# unchanged to bleu, which reads them through ReadFile.readlines_strip -
# confirm that callers pass what bleu expects.
#
# hypotheses - forwarded to bleu as hyp_file
# references - forwarded to bleu as ref_file
# n          - maximum n-gram order
# debug      - forwarded to bleu
def BLEU::hbleu hypotheses, references, n, debug=false
  (100*bleu(hypotheses, references, n, debug)).round(3)
end
|
|
133
|
+
|
|
134
|
+
# Sentence-level score of one hypothesis against several references.
#
# hypothesis - String
# references - Array of Strings
# n          - maximum n-gram order (default 4)
# smooth     - forwarded to brevity_penalty
#
# Orders above the token count of the longest reference are not
# evaluated. For orders >= 2 both the clipped and the total count are
# incremented by one before taking logs.
#
# Returns exp(mean log precision + brevity penalty).
def BLEU::per_sentence_bleu hypothesis, references, n=4, smooth=0.0
  # N-gram lists keyed by order, for the hypothesis and for each reference.
  hyp_ngrams = {}
  (1).upto(n) { |order| hyp_ngrams[order] = [] }
  ref_ngrams = references.map {
    by_order = {}
    (1).upto(n) { |order| by_order[order] = [] }
    by_order
  }
  ngrams(hypothesis, n) { |ng| hyp_ngrams[ng.size] << ng }
  references.each_with_index { |reference,idx|
    ngrams(reference, n) { |ng| ref_ngrams[idx][ng.size] << ng }
  }
  longest_ref = references.map { |reference| reference.split.size }.max
  max_order = [n, longest_ref].min
  logbleu = 0.0
  (1).upto(max_order) { |order|
    add = order >= 2 ? 1.0 : 0.0
    total = hyp_ngrams[order].size
    clipped = 0
    hyp_ngrams[order].uniq.each { |ng|
      in_hyp = hyp_ngrams[order].count(ng)
      # Clip against the reference containing this n-gram most often.
      in_refs = ref_ngrams.map { |r| r[order].count(ng) }.max
      clipped += [in_hyp, in_refs].min
    }
    logbleu += Math.log(clipped+add) - Math.log(total+add)
  }
  logbleu /= max_order
  hyp_len, best_ref_len = BLEU::best_match_length hypothesis, references
  logbleu += brevity_penalty hyp_len, best_ref_len, smooth
  return Math.exp logbleu
end
|
|
126
161
|
|
data/lib/zipf/misc.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: zipf
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.2.
|
|
4
|
+
version: 1.2.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Patrick Simianer
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2015-01-
|
|
11
|
+
date: 2015-01-25 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: json
|