zipf 1.2.1 → 1.2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/lib/zipf/bleu.rb +55 -20
  3. data/lib/zipf/misc.rb +8 -0
  4. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4a5374ff741398bdd58da174d27a130599c0573d
4
- data.tar.gz: 4df6f82132a2a8c79d3af0608fa91de6515d2527
3
+ metadata.gz: 5dc46e50a6d5f63fb3b1db68dce9cb35a8342172
4
+ data.tar.gz: d849ab3031db7b9b65f77e16d746792dcdaf4827
5
5
  SHA512:
6
- metadata.gz: 2a4ee64270b0af54115f2449774c6e632e9dc405f34169473318cb9c8fdccb52dfa470a5d7458f46bbf55e093086092354c38cbf05c4262a58497fd522540606
7
- data.tar.gz: b8d1c18f2bda9ad022067ca2296aacfe71f0052ac8f0f54bf58178dae251f44ff07846f51e21f478e4ff6ae50df5a878591013c1bbdee2c80af2b7e9693cd7f8
6
+ metadata.gz: 12d97d1af9aa19d63ed02b2c17c0b3506fecf9ed0b0c9cbea4f4e233ff87e4734c3a8a4c0f1754a9d09cdb48072d249e1479d89fae5b5175eb242d43e3b24aa7
7
+ data.tar.gz: 9e179f4165835a25984547f1f6a98166dd36747ec966f7adbc8bc5cbe561627040fc6ab7b03ff61f551b3fac0293d6c49ac117ef9d37b969cfd33fdb271482ea
data/lib/zipf/bleu.rb CHANGED
@@ -62,19 +62,34 @@ class BLEU::Ngrams
62
62
  end
63
63
  end
64
64
 
65
- def BLEU::get_counts hypothesis, reference, n, times=1
65
+ def BLEU::best_match_length hypothesis, references
66
+ hyp_len = hypothesis.strip.split.size
67
+ ref_lens = references.map { |r| r.strip.split.size }
68
+ min = Integer::MAX
69
+ min_idx = -1
70
+ ref_lens.each_with_index { |l,i|
71
+ min_idx = i if (hyp_len-l).abs < min
72
+ }
73
+ return hyp_len, ref_lens[min_idx]
74
+ end
75
+
76
+ def BLEU::get_counts hypothesis, references, n, times=1
66
77
  p = NgramCounts.new n
67
- r = Ngrams.new
68
- ngrams(reference, n) { |ng| r.add ng }
78
+ r = []
79
+ references.each { |reference|
80
+ r << Ngrams.new
81
+ ngrams(reference, n) { |ng| r.last.add ng }
82
+ }
69
83
  h = Ngrams.new
70
84
  ngrams(hypothesis, n) { |ng| h.add ng }
71
85
  h.each { |ng,count|
72
86
  sz = ng.size-1
73
87
  p.sum[sz] += count * times
74
- p.clipped[sz] += [r.get_count(ng), count].min * times
88
+ p.clipped[sz] += [r.map { |i| i.get_count(ng)}.max, count].min * times
75
89
  }
76
- p.ref_len = tokenize(reference.strip).size * times
77
- p.hyp_len = tokenize(hypothesis.strip).size * times
90
+ p.hyp_len, p.ref_len = best_match_length hypothesis, references
91
+ p.hyp_len *= times
92
+ p.ref_len *= times
78
93
  return p
79
94
  end
80
95
 
@@ -82,45 +97,65 @@ def BLEU::brevity_penalty c, r, smooth=0.0
82
97
  return [0.0, 1.0-((r+smooth)/c)].min
83
98
  end
84
99
 
85
- def BLEU::bleu counts, n, debug=false
100
+ def BLEU::bleu_ counts, n, debug=false
86
101
  corpus_stats = NgramCounts.new n
87
102
  counts.each { |i| corpus_stats.plus_eq i }
88
103
  logbleu = 0.0
89
104
  0.upto(n-1) { |m|
90
- STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug
105
+ STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]} = #{(corpus_stats.clipped[m]/corpus_stats.sum[m]).round 2}\n" if debug
91
106
  return 0.0 if corpus_stats.clipped[m] == 0 or corpus_stats.sum == 0
92
107
  logbleu += Math.log(corpus_stats.clipped[m]) - Math.log(corpus_stats.sum[m])
93
108
  }
94
109
  logbleu /= n
95
- if debug
96
- STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)}\n"
97
- STDERR.write "sum #{Math.exp(sum)}\n"
98
- end
110
+ STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len).round 2}\n" if debug
99
111
  logbleu += brevity_penalty corpus_stats.hyp_len, corpus_stats.ref_len
100
112
  return Math.exp logbleu
101
113
  end
102
114
 
103
- def BLEU::hbleu counts, n, debug=false
115
+ def BLEU::bleu hyp_file, ref_file, n, debug=false
116
+ hypotheses = ReadFile.readlines_strip(hyp_file)
117
+ references = ReadFile.readlines_strip(ref_file).map { |l|
118
+ splitpipe(l,3)
119
+ }
120
+ counts = []
121
+ hypotheses.each_with_index { |h,i|
122
+ counts << BLEU::get_counts(h, references[i], 4)
123
+ }
124
+ bleu_ counts, n, debug
125
+ end
126
+
127
+ def BLEU::hbleu_ counts, n, debug=false
104
128
  (100*bleu(counts, n, debug)).round(3)
105
129
  end
106
130
 
107
- def BLEU::per_sentence_bleu hypothesis, reference, n=4, smooth=0.0
108
- h_ng = {}; r_ng = {}
109
- (1).upto(n) { |i| h_ng[i] = []; r_ng[i] = [] }
131
+ def BLEU::hbleu hypotheses, references, n, debug=false
132
+ end
133
+
134
+ def BLEU::per_sentence_bleu hypothesis, references, n=4, smooth=0.0
135
+ h_ng = {}; r_ng = []
136
+ num_ref = references.size
137
+ num_ref.times { r_ng << {} }
138
+ (1).upto(n) { |i| h_ng[i] = []; num_ref.times { |j| r_ng[j][i] = [] } }
110
139
  ngrams(hypothesis, n) { |i| h_ng[i.size] << i }
111
- ngrams(reference, n) { |i| r_ng[i.size] << i }
112
- m = [n, reference.split.size].min
140
+ references.each_with_index { |reference,j|
141
+ ngrams(reference, n) { |i| r_ng[j][i.size] << i }
142
+ }
143
+ m = [n, references.map { |i| i.split.size }.max].min
113
144
  add = 0.0
114
145
  logbleu = 0.0
115
146
  (1).upto(m) { |i|
116
147
  counts_clipped = 0
117
148
  counts_sum = h_ng[i].size
118
- h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) }
149
+ h_ng[i].uniq.each { |j|
150
+ max_count = [h_ng[i].count(j), r_ng.map { |r| r[i].count(j) }.max].min
151
+ counts_clipped += max_count
152
+ }
119
153
  add = 1.0 if i >= 2
120
154
  logbleu += Math.log(counts_clipped+add) - Math.log(counts_sum+add);
121
155
  }
122
156
  logbleu /= m
123
- logbleu += brevity_penalty hypothesis.strip.split.size, reference.strip.split.size, smooth
157
+ hyp_len, best_ref_len = BLEU::best_match_length hypothesis, references
158
+ logbleu += brevity_penalty hyp_len, best_ref_len, smooth
124
159
  return Math.exp logbleu
125
160
  end
126
161
 
data/lib/zipf/misc.rb CHANGED
@@ -111,3 +111,11 @@ def read_config fn
111
111
  return cfg
112
112
  end
113
113
 
114
+ # https://gist.github.com/pithyless/9738125
115
+ class Integer
116
+ N_BYTES = [42].pack('i').size
117
+ N_BITS = N_BYTES * 16
118
+ MAX = 2 ** (N_BITS - 2) - 1
119
+ MIN = -MAX - 1
120
+ end
121
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: zipf
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.1
4
+ version: 1.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Patrick Simianer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-24 00:00:00.000000000 Z
11
+ date: 2015-01-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json