zipf 1.2.1 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/lib/zipf/bleu.rb +55 -20
  3. data/lib/zipf/misc.rb +8 -0
  4. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4a5374ff741398bdd58da174d27a130599c0573d
4
- data.tar.gz: 4df6f82132a2a8c79d3af0608fa91de6515d2527
3
+ metadata.gz: 5dc46e50a6d5f63fb3b1db68dce9cb35a8342172
4
+ data.tar.gz: d849ab3031db7b9b65f77e16d746792dcdaf4827
5
5
  SHA512:
6
- metadata.gz: 2a4ee64270b0af54115f2449774c6e632e9dc405f34169473318cb9c8fdccb52dfa470a5d7458f46bbf55e093086092354c38cbf05c4262a58497fd522540606
7
- data.tar.gz: b8d1c18f2bda9ad022067ca2296aacfe71f0052ac8f0f54bf58178dae251f44ff07846f51e21f478e4ff6ae50df5a878591013c1bbdee2c80af2b7e9693cd7f8
6
+ metadata.gz: 12d97d1af9aa19d63ed02b2c17c0b3506fecf9ed0b0c9cbea4f4e233ff87e4734c3a8a4c0f1754a9d09cdb48072d249e1479d89fae5b5175eb242d43e3b24aa7
7
+ data.tar.gz: 9e179f4165835a25984547f1f6a98166dd36747ec966f7adbc8bc5cbe561627040fc6ab7b03ff61f551b3fac0293d6c49ac117ef9d37b969cfd33fdb271482ea
@@ -62,19 +62,34 @@ class BLEU::Ngrams
62
62
  end
63
63
  end
64
64
 
65
- def BLEU::get_counts hypothesis, reference, n, times=1
65
+ def BLEU::best_match_length hypothesis, references
66
+ hyp_len = hypothesis.strip.split.size
67
+ ref_lens = references.map { |r| r.strip.split.size }
68
+ min = Integer::MAX
69
+ min_idx = -1
70
+ ref_lens.each_with_index { |l,i|
71
+ min_idx = i if (hyp_len-l).abs < min
72
+ }
73
+ return hyp_len, ref_lens[min_idx]
74
+ end
75
+
76
+ def BLEU::get_counts hypothesis, references, n, times=1
66
77
  p = NgramCounts.new n
67
- r = Ngrams.new
68
- ngrams(reference, n) { |ng| r.add ng }
78
+ r = []
79
+ references.each { |reference|
80
+ r << Ngrams.new
81
+ ngrams(reference, n) { |ng| r.last.add ng }
82
+ }
69
83
  h = Ngrams.new
70
84
  ngrams(hypothesis, n) { |ng| h.add ng }
71
85
  h.each { |ng,count|
72
86
  sz = ng.size-1
73
87
  p.sum[sz] += count * times
74
- p.clipped[sz] += [r.get_count(ng), count].min * times
88
+ p.clipped[sz] += [r.map { |i| i.get_count(ng)}.max, count].min * times
75
89
  }
76
- p.ref_len = tokenize(reference.strip).size * times
77
- p.hyp_len = tokenize(hypothesis.strip).size * times
90
+ p.hyp_len, p.ref_len = best_match_length hypothesis, references
91
+ p.hyp_len *= times
92
+ p.ref_len *= times
78
93
  return p
79
94
  end
80
95
 
@@ -82,45 +97,65 @@ def BLEU::brevity_penalty c, r, smooth=0.0
82
97
  return [0.0, 1.0-((r+smooth)/c)].min
83
98
  end
84
99
 
85
- def BLEU::bleu counts, n, debug=false
100
+ def BLEU::bleu_ counts, n, debug=false
86
101
  corpus_stats = NgramCounts.new n
87
102
  counts.each { |i| corpus_stats.plus_eq i }
88
103
  logbleu = 0.0
89
104
  0.upto(n-1) { |m|
90
- STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug
105
+ STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]} = #{(corpus_stats.clipped[m]/corpus_stats.sum[m]).round 2}\n" if debug
91
106
  return 0.0 if corpus_stats.clipped[m] == 0 or corpus_stats.sum == 0
92
107
  logbleu += Math.log(corpus_stats.clipped[m]) - Math.log(corpus_stats.sum[m])
93
108
  }
94
109
  logbleu /= n
95
- if debug
96
- STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)}\n"
97
- STDERR.write "sum #{Math.exp(sum)}\n"
98
- end
110
+ STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len).round 2}\n" if debug
99
111
  logbleu += brevity_penalty corpus_stats.hyp_len, corpus_stats.ref_len
100
112
  return Math.exp logbleu
101
113
  end
102
114
 
103
- def BLEU::hbleu counts, n, debug=false
115
+ def BLEU::bleu hyp_file, ref_file, n, debug=false
116
+ hypotheses = ReadFile.readlines_strip(hyp_file)
117
+ references = ReadFile.readlines_strip(ref_file).map { |l|
118
+ splitpipe(l,3)
119
+ }
120
+ counts = []
121
+ hypotheses.each_with_index { |h,i|
122
+ counts << BLEU::get_counts(h, references[i], 4)
123
+ }
124
+ bleu_ counts, n, debug
125
+ end
126
+
127
+ def BLEU::hbleu_ counts, n, debug=false
104
128
  (100*bleu(counts, n, debug)).round(3)
105
129
  end
106
130
 
107
- def BLEU::per_sentence_bleu hypothesis, reference, n=4, smooth=0.0
108
- h_ng = {}; r_ng = {}
109
- (1).upto(n) { |i| h_ng[i] = []; r_ng[i] = [] }
131
+ def BLEU::hbleu hypotheses, references, n, debug=false
132
+ end
133
+
134
+ def BLEU::per_sentence_bleu hypothesis, references, n=4, smooth=0.0
135
+ h_ng = {}; r_ng = []
136
+ num_ref = references.size
137
+ num_ref.times { r_ng << {} }
138
+ (1).upto(n) { |i| h_ng[i] = []; num_ref.times { |j| r_ng[j][i] = [] } }
110
139
  ngrams(hypothesis, n) { |i| h_ng[i.size] << i }
111
- ngrams(reference, n) { |i| r_ng[i.size] << i }
112
- m = [n, reference.split.size].min
140
+ references.each_with_index { |reference,j|
141
+ ngrams(reference, n) { |i| r_ng[j][i.size] << i }
142
+ }
143
+ m = [n, references.map { |i| i.split.size }.max].min
113
144
  add = 0.0
114
145
  logbleu = 0.0
115
146
  (1).upto(m) { |i|
116
147
  counts_clipped = 0
117
148
  counts_sum = h_ng[i].size
118
- h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) }
149
+ h_ng[i].uniq.each { |j|
150
+ max_count = [h_ng[i].count(j), r_ng.map { |r| r[i].count(j) }.max].min
151
+ counts_clipped += max_count
152
+ }
119
153
  add = 1.0 if i >= 2
120
154
  logbleu += Math.log(counts_clipped+add) - Math.log(counts_sum+add);
121
155
  }
122
156
  logbleu /= m
123
- logbleu += brevity_penalty hypothesis.strip.split.size, reference.strip.split.size, smooth
157
+ hyp_len, best_ref_len = BLEU::best_match_length hypothesis, references
158
+ logbleu += brevity_penalty hyp_len, best_ref_len, smooth
124
159
  return Math.exp logbleu
125
160
  end
126
161
 
@@ -111,3 +111,11 @@ def read_config fn
111
111
  return cfg
112
112
  end
113
113
 
114
+ # https://gist.github.com/pithyless/9738125
115
+ class Integer
116
+ N_BYTES = [42].pack('i').size
117
+ N_BITS = N_BYTES * 16
118
+ MAX = 2 ** (N_BITS - 2) - 1
119
+ MIN = -MAX - 1
120
+ end
121
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: zipf
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.1
4
+ version: 1.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Patrick Simianer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-24 00:00:00.000000000 Z
11
+ date: 2015-01-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json