RubyGems - zipf - Versions diffs - 1.2.1 → 1.2.2 - Mend

zipf 1.2.1 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 4a5374ff741398bdd58da174d27a130599c0573d
-  data.tar.gz: 4df6f82132a2a8c79d3af0608fa91de6515d2527
+  metadata.gz: 5dc46e50a6d5f63fb3b1db68dce9cb35a8342172
+  data.tar.gz: d849ab3031db7b9b65f77e16d746792dcdaf4827
 SHA512:
-  metadata.gz: 2a4ee64270b0af54115f2449774c6e632e9dc405f34169473318cb9c8fdccb52dfa470a5d7458f46bbf55e093086092354c38cbf05c4262a58497fd522540606
-  data.tar.gz: b8d1c18f2bda9ad022067ca2296aacfe71f0052ac8f0f54bf58178dae251f44ff07846f51e21f478e4ff6ae50df5a878591013c1bbdee2c80af2b7e9693cd7f8
+  metadata.gz: 12d97d1af9aa19d63ed02b2c17c0b3506fecf9ed0b0c9cbea4f4e233ff87e4734c3a8a4c0f1754a9d09cdb48072d249e1479d89fae5b5175eb242d43e3b24aa7
+  data.tar.gz: 9e179f4165835a25984547f1f6a98166dd36747ec966f7adbc8bc5cbe561627040fc6ab7b03ff61f551b3fac0293d6c49ac117ef9d37b969cfd33fdb271482ea

data/lib/zipf/bleu.rb CHANGED

@@ -62,19 +62,34 @@ class BLEU::Ngrams
   end
 end
-def BLEU::get_counts hypothesis, reference, n, times=1
+def BLEU::best_match_length hypothesis, references
+  hyp_len = hypothesis.strip.split.size
+  ref_lens = references.map { |r| r.strip.split.size }
+  min = Integer::MAX
+  min_idx = -1
+  ref_lens.each_with_index { |l,i|
+    min_idx = i if (hyp_len-l).abs < min
+  }
+  return hyp_len, ref_lens[min_idx]
+end
+def BLEU::get_counts hypothesis, references, n, times=1
   p = NgramCounts.new n
-  r = Ngrams.new
-  ngrams(reference, n) { |ng| r.add ng }
+  r = []
+  references.each { |reference|
+    r << Ngrams.new
+    ngrams(reference, n) { |ng| r.last.add ng }
+  }
   h = Ngrams.new
   ngrams(hypothesis, n) { |ng| h.add ng }
   h.each { |ng,count|
     sz = ng.size-1
     p.sum[sz] += count * times
-    p.clipped[sz] += [r.get_count(ng), count].min * times
+    p.clipped[sz] += [r.map { |i| i.get_count(ng)}.max, count].min * times
   }
-  p.ref_len = tokenize(reference.strip).size * times
-  p.hyp_len = tokenize(hypothesis.strip).size * times
+  p.hyp_len, p.ref_len = best_match_length hypothesis, references
+  p.hyp_len *= times
+  p.ref_len *= times
   return p
 end
@@ -82,45 +97,65 @@ def BLEU::brevity_penalty c, r, smooth=0.0
   return [0.0, 1.0-((r+smooth)/c)].min
 end
-def BLEU::bleu counts, n, debug=false
+def BLEU::bleu_ counts, n, debug=false
   corpus_stats = NgramCounts.new n
   counts.each { |i| corpus_stats.plus_eq i }
   logbleu = 0.0
   0.upto(n-1) { |m|
-    STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug
+    STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]} = #{(corpus_stats.clipped[m]/corpus_stats.sum[m]).round 2}\n" if debug
     return 0.0 if corpus_stats.clipped[m] == 0 or corpus_stats.sum == 0
     logbleu += Math.log(corpus_stats.clipped[m]) - Math.log(corpus_stats.sum[m])
   }
   logbleu /= n
-  if debug
-    STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)}\n"
-    STDERR.write "sum #{Math.exp(sum)}\n"
-  end
+  STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len).round 2}\n" if debug
   logbleu += brevity_penalty corpus_stats.hyp_len, corpus_stats.ref_len
   return Math.exp logbleu
 end
-def BLEU::hbleu counts, n, debug=false
+def BLEU::bleu hyp_file, ref_file, n, debug=false
+  hypotheses = ReadFile.readlines_strip(hyp_file)
+  references = ReadFile.readlines_strip(ref_file).map { |l|
+    splitpipe(l,3)
+  }
+  counts = []
+  hypotheses.each_with_index { |h,i|
+    counts << BLEU::get_counts(h, references[i], 4)
+  }
+  bleu_ counts, n, debug
+end
+def BLEU::hbleu_ counts, n, debug=false
   (100*bleu(counts, n, debug)).round(3)
 end
-def BLEU::per_sentence_bleu hypothesis, reference, n=4, smooth=0.0
-  h_ng = {}; r_ng = {}
-  (1).upto(n) { |i| h_ng[i] = []; r_ng[i] = [] }
+def BLEU::hbleu hypotheses, references, n, debug=false
+end
+def BLEU::per_sentence_bleu hypothesis, references, n=4, smooth=0.0
+  h_ng = {}; r_ng = []
+  num_ref = references.size
+  num_ref.times { r_ng << {} }
+  (1).upto(n) { |i| h_ng[i] = []; num_ref.times { |j| r_ng[j][i] = [] } }
   ngrams(hypothesis, n) { |i| h_ng[i.size] << i }
-  ngrams(reference, n) { |i| r_ng[i.size] << i }
-  m = [n, reference.split.size].min
+  references.each_with_index { |reference,j|
+    ngrams(reference, n) { |i| r_ng[j][i.size] << i }
+  }
+  m = [n, references.map { |i| i.split.size }.max].min
   add = 0.0
   logbleu = 0.0
   (1).upto(m) { |i|
     counts_clipped = 0
     counts_sum = h_ng[i].size
-    h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) }
+    h_ng[i].uniq.each { |j|
+      max_count = [h_ng[i].count(j), r_ng.map { |r| r[i].count(j) }.max].min
+      counts_clipped += max_count
+    }
     add = 1.0 if i >= 2
     logbleu += Math.log(counts_clipped+add) - Math.log(counts_sum+add);
   }
   logbleu /= m
-  logbleu += brevity_penalty hypothesis.strip.split.size, reference.strip.split.size, smooth
+  hyp_len, best_ref_len = BLEU::best_match_length hypothesis, references
+  logbleu += brevity_penalty hyp_len, best_ref_len, smooth
   return Math.exp logbleu
 end

data/lib/zipf/misc.rb CHANGED

@@ -111,3 +111,11 @@ def read_config fn
   return cfg
 end
+# https://gist.github.com/pithyless/9738125
+class Integer
+  N_BYTES = [42].pack('i').size
+  N_BITS = N_BYTES * 16
+  MAX = 2 ** (N_BITS - 2) - 1
+  MIN = -MAX - 1
+end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: zipf
 version: !ruby/object:Gem::Version
-  version: 1.2.1
+  version: 1.2.2
 platform: ruby
 authors:
 - Patrick Simianer
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-01-24 00:00:00.000000000 Z
+date: 2015-01-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: json