RubyGems - crosslanguagespotter - Versions diffs - 0.0.2-java - Mend

crosslanguagespotter 0.0.2-java

Files changed (92) hide show

checksums.yaml +7 -0
data/.gitignore +19 -0
data/Gemfile +3 -0
data/Rakefile +13 -0
data/crosslanguagespotter.gemspec +36 -0
data/examples/ex1.rb +13 -0
data/examples/services_example.rb +13 -0
data/lib/crosslanguagespotter/basic.rb +157 -0
data/lib/crosslanguagespotter/context.rb +139 -0
data/lib/crosslanguagespotter/figures_evaluator.rb +160 -0
data/lib/crosslanguagespotter/jaccard.rb +114 -0
data/lib/crosslanguagespotter/methods/context.rb +127 -0
data/lib/crosslanguagespotter/methods/jaro.rb +118 -0
data/lib/crosslanguagespotter/methods/tversky.rb +44 -0
data/lib/crosslanguagespotter/model_loading.rb +333 -0
data/lib/crosslanguagespotter/oracle.rb +261 -0
data/lib/crosslanguagespotter/report.rb +88 -0
data/lib/crosslanguagespotter/version.rb +5 -0
data/lib/crosslanguagespotter/wekaintegration.rb +83 -0
data/lib/crosslanguagespotter.rb +7 -0
data/lib/jars/weka.jar +0 -0
data/resources/css/bootstrap-theme.css +346 -0
data/resources/css/bootstrap-theme.min.css +7 -0
data/resources/css/bootstrap.css +5780 -0
data/resources/css/bootstrap.min.css +7 -0
data/resources/css/highlightstyles/arta.css +160 -0
data/resources/css/highlightstyles/ascetic.css +50 -0
data/resources/css/highlightstyles/atelier-dune.dark.css +93 -0
data/resources/css/highlightstyles/atelier-dune.light.css +93 -0
data/resources/css/highlightstyles/atelier-forest.dark.css +93 -0
data/resources/css/highlightstyles/atelier-forest.light.css +93 -0
data/resources/css/highlightstyles/atelier-heath.dark.css +93 -0
data/resources/css/highlightstyles/atelier-heath.light.css +93 -0
data/resources/css/highlightstyles/atelier-lakeside.dark.css +93 -0
data/resources/css/highlightstyles/atelier-lakeside.light.css +93 -0
data/resources/css/highlightstyles/atelier-seaside.dark.css +93 -0
data/resources/css/highlightstyles/atelier-seaside.light.css +93 -0
data/resources/css/highlightstyles/brown_paper.css +105 -0
data/resources/css/highlightstyles/brown_papersq.png +0 -0
data/resources/css/highlightstyles/dark.css +105 -0
data/resources/css/highlightstyles/default.css +153 -0
data/resources/css/highlightstyles/docco.css +132 -0
data/resources/css/highlightstyles/far.css +113 -0
data/resources/css/highlightstyles/foundation.css +133 -0
data/resources/css/highlightstyles/github.css +125 -0
data/resources/css/highlightstyles/googlecode.css +147 -0
data/resources/css/highlightstyles/idea.css +122 -0
data/resources/css/highlightstyles/ir_black.css +105 -0
data/resources/css/highlightstyles/magula.css +123 -0
data/resources/css/highlightstyles/mono-blue.css +62 -0
data/resources/css/highlightstyles/monokai.css +127 -0
data/resources/css/highlightstyles/monokai_sublime.css +149 -0
data/resources/css/highlightstyles/obsidian.css +154 -0
data/resources/css/highlightstyles/paraiso.dark.css +93 -0
data/resources/css/highlightstyles/paraiso.light.css +93 -0
data/resources/css/highlightstyles/pojoaque.css +106 -0
data/resources/css/highlightstyles/pojoaque.jpg +0 -0
data/resources/css/highlightstyles/railscasts.css +182 -0
data/resources/css/highlightstyles/rainbow.css +112 -0
data/resources/css/highlightstyles/school_book.css +113 -0
data/resources/css/highlightstyles/school_book.png +0 -0
data/resources/css/highlightstyles/solarized_dark.css +107 -0
data/resources/css/highlightstyles/solarized_light.css +107 -0
data/resources/css/highlightstyles/sunburst.css +160 -0
data/resources/css/highlightstyles/tomorrow-night-blue.css +93 -0
data/resources/css/highlightstyles/tomorrow-night-bright.css +92 -0
data/resources/css/highlightstyles/tomorrow-night-eighties.css +92 -0
data/resources/css/highlightstyles/tomorrow-night.css +93 -0
data/resources/css/highlightstyles/tomorrow.css +90 -0
data/resources/css/highlightstyles/vs.css +89 -0
data/resources/css/highlightstyles/xcode.css +158 -0
data/resources/css/highlightstyles/zenburn.css +117 -0
data/resources/example.html +1501 -0
data/resources/js/bootstrap.js +1943 -0
data/resources/js/bootstrap.min.js +7 -0
data/resources/js/highlight.pack.js +1 -0
data/resources/services_example.html +141 -0
data/resources/template.html +61 -0
data/test/data/angular-puzzle.GS +111 -0
data/test/data/angular_puzzle/app.js +66 -0
data/test/data/angular_puzzle/index.html +67 -0
data/test/data/angular_puzzle/slidingPuzzle.js +203 -0
data/test/data/angular_puzzle/wordSearchPuzzle.js +270 -0
data/test/data/example.html +5 -0
data/test/data/example.js +4 -0
data/test/data/services/index.html +33 -0
data/test/data/services/script.js +15 -0
data/test/test_helper.rb +9 -0
data/test/test_parsing.rb +23 -0
data/test/test_spotter.rb +42 -0
data/test/test_wekaintegration.rb +43 -0
metadata +328 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 68ee2d427d4dacc3d22b88a1deababeb2a1bcafd
+  data.tar.gz: 65ab29b3ba450f824a4930bd0a436170efabff57
+SHA512:
+  metadata.gz: ec2f763115ece653ad3a1379f24397bd73e18f3d4d101b222a922fead038bb4498459319595ac86444e253176b66ca9ebe93f117632293f879ee1bbf2f348cb8
+  data.tar.gz: d7b1571637ec59db6e9519e035c4826fecde7d121b2e8f03898317f4eaf3cc237bb9253833f4f90357fa604fb1c0b31cfb9b1b27b496218b3066aa3034d4fae8

data/.gitignore ADDED Viewed

@@ -0,0 +1,19 @@
+Gemfile.lock
+*.gem
+*.rbc
+.bundle
+.config
+coverage
+InstalledFiles
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp
+# YARD artifacts
+.yardoc
+_yardoc
+doc/

data/Gemfile ADDED Viewed

@@ -0,0 +1,3 @@
+# Fetch gems over HTTPS rather than plain HTTP.
+source "https://rubygems.org"
+# Dependencies are taken from the gemspec of this project.
+gemspec

data/Rakefile ADDED Viewed

@@ -0,0 +1,13 @@
+require 'rake/testtask'
+require 'rubygems/tasks'
+Rake::TestTask.new do |t|
+  t.libs << 'test'
+end
+Gem::Tasks.new do |tasks|
+  tasks.console.command = 'jruby'
+end
+desc "Run tests"
+task :default => :test

data/crosslanguagespotter.gemspec ADDED Viewed

@@ -0,0 +1,36 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'crosslanguagespotter/version'
+Gem::Specification.new do |s|
+  s.platform    = 'java'
+  s.name        = 'crosslanguagespotter'
+  s.version     = CrossLanguageSpotter::VERSION
+  s.summary     = "Automatic Spotter of Cross-Language references"
+  s.description = "Automatic Spotter of Cross-Language references"
+  s.authors     = ["Federico Tomassetti"]
+  s.email       = 'f.tomassetti@gmail.com'
+  s.homepage    = 'https://github.com/CrossLanguageProject/crosslanguagerelationsspotter'
+  s.license     = "Apache v2"
+  s.files         = `git ls-files`.split($/)
+  s.executables   = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  s.test_files    = s.files.grep(%r{^(test|spec|features)/})
+  s.require_paths = ["lib"]
+  s.add_dependency('codemodels')
+  s.add_dependency('codemodels-js')
+  s.add_dependency('codemodels-html')
+  s.add_dependency('codemodels-java')
+  s.add_dependency('codemodels-ruby')
+  s.add_dependency('codemodels-xml')
+  s.add_dependency('codemodels-properties')
+  s.add_dependency('htmlentities')
+  s.add_dependency('liquid')
+  s.add_development_dependency "bundler"
+  s.add_development_dependency "rake"
+  s.add_development_dependency "simplecov"
+  s.add_development_dependency "rubygems-tasks"
+end

data/examples/ex1.rb ADDED Viewed

@@ -0,0 +1,13 @@
+$: << './lib'
+require 'crosslanguagespotter'
+include CrossLanguageSpotter
+oracle_loader = OracleLoader.new
+classifier = oracle_loader.build_weka_classifier('./test/data/angular_puzzle','./test/data/angular-puzzle.GS')
+path = './test/data/angular_puzzle'
+spotter = CrossLanguageSpotter::Spotter.new()
+project = Project.new(path)
+relations = spotter.classify_relations(project,classifier)
+generate_report_file(relations,'resources/example.html')

data/examples/services_example.rb ADDED Viewed

@@ -0,0 +1,13 @@
+$: << './lib'
+require 'crosslanguagespotter'
+include CrossLanguageSpotter
+oracle_loader = OracleLoader.new
+classifier = oracle_loader.build_weka_classifier('./test/data/angular_puzzle','./test/data/angular-puzzle.GS')
+path = './test/data/services'
+spotter = CrossLanguageSpotter::Spotter.new()
+project = Project.new(path)
+relations = spotter.classify_relations(project,classifier)
+generate_report_file(relations,'resources/services_example.html')

data/lib/crosslanguagespotter/basic.rb ADDED Viewed

@@ -0,0 +1,157 @@
+# encoding: utf-8
+require "codemodels"
+require "codemodels/html"
+require "codemodels/js"
+require 'crosslanguagespotter/figures_evaluator'
+require 'crosslanguagespotter/methods/context'
+require 'crosslanguagespotter/methods/tversky'
+require 'crosslanguagespotter/methods/jaro'
+require 'crosslanguagespotter/model_loading'
+require 'csv'
+require 'set'
+require 'crosslanguagespotter/jaccard'
+module CrossLanguageSpotter
+    def self._load_models(dir,base_path='',models={})
+        Dir.foreach(dir) do |f|
+            if f!='.' and f!='..'
+                path = dir+'/'+f
+                if File.directory?(path)
+                    _load_models(path,base_path+'/'+dir,models)
+                else
+                    begin
+                        models[base_path+'/'+f] = CodeModels.parse_file(path)
+                    rescue Exception => e
+                        puts "No model available for #{path}: #{e}"
+                    end
+                end
+            end
+        end
+        return models
+    end
+    class Spotter
+        def initialize
+            @verbose = false
+        end
+        def find_relations(dir)
+            models = CrossLanguageSpotter._load_models(dir)
+            _calc(dir,models)
+        end
+        def features_for_dir(dir)
+            project = Project.new(dir,@verbose)
+            return features_for_project(project)
+        end
+        def classify_relations(project,classifier)
+            features_data = features_for_project(project)
+            data = []
+            list_of_original_features_rows = []
+            features_data.each do |rel,row|
+                row[:result] = false
+                data.push(row)
+                list_of_original_features_rows.push(row)
+            end
+            keys = {shared_length: :numeric,
+                tfidf_shared: :numeric,itfidf_shared: :numeric,
+                perc_shared_length_min: :numeric,
+                perc_shared_length_max: :numeric,
+                diff_min: :numeric,diff_max: :numeric,
+                perc_diff_min: :numeric,perc_diff_max: :numeric,
+                context: :numeric,jaccard: :numeric,jaro: :numeric,tversky: :numeric,
+                result: :boolean}
+            data_instances = hash2weka_instances("data",data,keys,:result)
+            classification = classifier.classify(data_instances)
+            i=0
+            results = []
+            classification.each do |c|
+                if c[:result]
+                    # just put a true in the real relations
+                    list_of_original_features_rows[i][:result] = true
+                    results.push(list_of_original_features_rows[i])
+                end
+                i+=1
+            end
+            return results
+        end
+        def features_for_project(project)
+            results = {}
+            tversky_producer = TverskyReferencesProducer.new  ({:alpha => 0.5, :threshold => 0.0})
+            context_producer  = ContextReferencesProducer.new ({:alpha => 1.0, :threshold => 0.0})
+            context_points_map = context_producer.points_map(project)
+            jaro_producer = JaroReferencesProducer.new ({:winkleradjust=>false,:threshold=>0.0})
+            block = Proc.new do |ni,nj|
+                context_ni = context(ni).values & project.shared_ids
+                context_nj = context(nj).values & project.shared_ids
+                shared_length = (context_ni & context_nj).count
+                file_i = ni.source.artifact(:absolute).filename
+                file_j = ni.source.artifact(:absolute).filename
+                tfidf_shared = 0
+                itfidf_shared = 0
+                (context_ni & context_nj).each do |v|
+                    tfidf_shared += project.tf_idf(file_i,v)+project.tf_idf(file_j,v)
+                    itfidf_shared += project.itf_idf(file_i,v)+project.itf_idf(file_j,v)
+                end
+                perc_shared_length_i = context_ni.count==0 ? 0.0 : shared_length.to_f/context_ni.count.to_f
+                perc_shared_length_j = context_nj.count==0 ? 0.0 : shared_length.to_f/context_nj.count.to_f
+                perc_shared_length   = [perc_shared_length_i,perc_shared_length_j]
+                perc_shared_length_min = (perc_shared_length[0]<perc_shared_length[1]) ? perc_shared_length[0] : perc_shared_length[1]
+                perc_shared_length_max = (perc_shared_length[0]<perc_shared_length[1]) ? perc_shared_length[1] : perc_shared_length[0]
+                diffs = [context_ni.count-shared_length,context_nj.count-shared_length]
+                diff_min = diffs.min
+                diff_max = diffs.max
+                perc_diff_i = context_ni.count==0 ? 0.0 : diffs[0].to_f/context_ni.count.to_f
+                perc_diff_j = context_nj.count==0 ? 0.0 : diffs[1].to_f/context_nj.count.to_f
+                perc_diffs = [perc_diff_i,perc_diff_j]
+                perc_diff_min = (perc_diffs[0]<perc_diffs[1]) ? perc_diffs[0] : perc_diffs[1]
+                perc_diff_max = (perc_diffs[0]<perc_diffs[1]) ? perc_diffs[1] : perc_diffs[0]
+                id_i = NodeId.from_node(ni)
+                id_j = NodeId.from_node(nj)
+                rel = CrossLanguageRelation.new([id_i,id_j])
+                jaccard = Jaccard.coefficient(context_ni,context_nj)
+                jaccard = 0.0 if jaccard.nan?
+                tversky = tversky_producer.tversky_coefficient(context_ni,context_nj)
+                tversky = 0.0 if tversky.nan?
+                context = context_points_map.points(Pair.new(ni,nj))
+                jaro    = jaro_producer.jaro_coefficient_from_nodes(ni,nj)
+                jaro    = 0.0 if jaro.nan?
+                results[rel] = {
+                    node_a_file:         ni.source.artifact(:absolute).filename,
+                    node_a_begin_line:   ni.source.position(:absolute).begin_line,
+                    node_a_end_line:     ni.source.position(:absolute).end_line,
+                    node_a_begin_column: ni.source.position(:absolute).begin_column,
+                    node_a_end_column:   ni.source.position(:absolute).end_column,
+                    node_b_file:         nj.source.artifact(:absolute).filename,
+                    node_b_begin_line:   nj.source.position(:absolute).begin_line,
+                    node_b_end_line:     nj.source.position(:absolute).end_line,
+                    node_b_begin_column: nj.source.position(:absolute).begin_column,
+                    node_b_end_column:   nj.source.position(:absolute).end_column,
+                    shared_length:shared_length,
+                    tfidf_shared:tfidf_shared,itfidf_shared:itfidf_shared,
+                    perc_shared_length_min:perc_shared_length_min,
+                    perc_shared_length_max:perc_shared_length_max,
+                    diff_min:diff_min,diff_max:diff_max,
+                    perc_diff_min:perc_diff_min,perc_diff_max:perc_diff_max,
+                    context:context,jaccard:jaccard,jaro:jaro,tversky:tversky}
+            end
+            project.iter_over_shared_ids_instances {|ni,nj| block.call(ni,nj) }
+            return results
+        end
+    end
+end

data/lib/crosslanguagespotter/context.rb ADDED Viewed

@@ -0,0 +1,139 @@
+require 'crosslanguagespotter/model_loading'
+require 'set'
+module CrossLanguageSpotter
+def collect_values_with_declarator(node)
+    declarators_per_value = Hash.new {|h,k| h[k]=[]}
+    self.class.ecore.eAllAttributes.each do |a|
+        v = self.send(:"#{a.name}")
+        if v!=nil
+            if a.many
+                v.each {|el| values[el]+=1}
+            else
+                values[v]+=1
+            end
+        end
+    end
+    values
+end
+# A collection of values, each associated with the list of elements
+# ("declarators") it was registered with.
+class Context
+    # Every registered value, in registration order (duplicates included).
+    attr_reader :sequence_of_values
+    def initialize
+        @map = Hash.new {|h,k| h[k]=[]}
+        @sequence_of_values = []
+        @register_sequence = []
+    end
+    # Values that currently have at least one declarator.
+    def values
+        @map.keys.select {|k| @map[k].count>0}
+    end
+    # Number of values having at least one declarator.
+    def count
+        values.count
+    end
+    def has_value?(v)
+        values.include?(v)
+    end
+    # Declarators registered for +value+ (an empty list when there are none).
+    def declarators_per_value(value)
+        @map[value]
+    end
+    # Records that +value+ is declared by +declarator+. The same declarator
+    # is stored only once per value.
+    def register(value,declarator)
+        @sequence_of_values << value
+        @map[value] << declarator unless @map[value].include?(declarator)
+        # keep the real declarator, so that #clone can replay the
+        # registrations faithfully
+        @register_sequence << {value:value, declarator:declarator}
+    end
+    # Registers in this context every value/declarator pair of +other+.
+    def merge(other)
+        other.values.each do |v|
+            other.declarators_per_value(v).each do |d|
+                register(v,d)
+            end
+        end
+    end
+    # Returns a new Context obtained by replaying the registrations
+    # performed on this one.
+    def clone
+        new_instance = Context.new
+        @register_sequence.each do |r|
+            new_instance.register(r[:value],r[:declarator])
+        end
+        new_instance
+    end
+    # Non-destructive version of #intersection!: works on a clone.
+    def intersection(values)
+        new_instance = self.clone
+        new_instance.intersection!(values)
+        new_instance
+    end
+    # Drops the declarators of every value not present in +values+, which
+    # can be an Array or a Context. When it is a Context, the declarators it
+    # holds for the shared values are appended to the ones of this context.
+    # Raises "error" for any other kind of argument (checked per stored key).
+    # Returns self.
+    def intersection!(values)
+        @map.keys.each do |k|
+            case values
+            when Array
+                @map[k] = [] unless values.include?(k)
+            when Context
+                if values.has_value?(k)
+                    @map[k].concat(values.declarators_per_value(k))
+                else
+                    @map[k] = []
+                end
+            else
+                raise "error"
+            end
+        end
+        self
+    end
+    # Array of {value:, declarators:} hashes, sorted by value.
+    def to_a
+        values.sort.map {|v| {value:v,declarators:declarators_per_value(v)} }
+    end
+    def to_s
+        to_a.to_s
+    end
+end
+def context(node)
+    ctx = Context.new
+    container = node.container_also_foreign
+    if container
+        ctx.merge(context(container))
+        # RGen attributes of the father
+        container.collect_values_with_count.keys.each do |value|
+            ctx.register(value,container)
+        end
+        # siblings in different containment reference
+        container.all_children_also_foreign.each do |sibling|
+            if (sibling.eContainingFeature!=node.eContainingFeature) || (node.eContainingFeature==nil && node!=sibling)
+                sibling.traverse(:also_foreign) do |n|
+                    n.collect_values_with_count.keys.each do |value|
+                        ctx.register(value,n)
+                    end
+                end
+            end
+        end
+    end
+    ctx
+end
+end

data/lib/crosslanguagespotter/figures_evaluator.rb ADDED Viewed

@@ -0,0 +1,160 @@
+module CrossLanguageSpotter
+class CrossLanguageReferencesProducer
+    def initialize(parameters)
+    end
+    # It should produce a set of CrossLanguageRelation
+    def produce_set(project)
+    end
+end
+# It compare different methods, each methods can be instantiated
+# different times using different parameters
+class CrossLanguageReferencesProducerMethodsComparator
+    # map per class, per params of the figures obtained agains the given gold set
+    attr_reader :results
+    def initialize(gold_set,project)
+        @gold_set = gold_set
+        @results = Hash.new {|h,k| h[k]={}}
+        @project = project
+    end
+    def add(clazz,parameters)
+        producer = clazz.new(parameters)
+        observed_set = producer.produce_set(@project)
+        fe = FiguresEvaluator.new(@gold_set,observed_set)
+        result = fe.all_figures
+        @results[clazz][parameters] = result
+        result
+    end
+end
+class NodeId
+    attr_reader :file
+    attr_reader :node_index
+    def index
+        @node_index
+    end
+    def self.from_node(node)
+        new(node.source.artifact.final_host.filename,traverse_index(node))
+    end
+    def initialize(file,node_index)
+        @file = file
+        @node_index = node_index
+    end
+    def eql?(other)
+        return false unless other.is_a?(NodeId)
+        self.file.eql?(other.file) && self.node_index.eql?(other.node_index)
+    end
+    def ==(other)
+        return self.eql?(other)
+    end
+    def hash
+        @file.hash*7+@node_index.hash
+    end
+    def <=>(other)
+        res = self.file <=> other.file
+        if res==0
+            self.node_index <=> other.node_index
+        else
+            res
+        end
+    end
+    def to_s
+        "#{@file}:#{@node_index}"
+    end
+end
+# It is a set of two node_ids (unordered)
+class CrossLanguageRelation
+    attr_reader :node_ids
+    def initialize(node_ids)
+        raise "Two elements expected, #{node_ids.count} found" unless node_ids.count==2
+        node_id_a = node_ids[0]
+        node_id_b = node_ids[1]
+        if (node_id_a<=>node_id_b)<0
+            @node_ids = [node_id_a,node_id_b]
+        else
+            @node_ids = [node_id_b,node_id_a]
+        end
+        #puts "SORTING GAVE #{@node_ids}"
+    end
+    def eql?(other)
+        return false unless other.is_a?(CrossLanguageRelation)
+        self.node_ids.eql?(other.node_ids)
+    end
+    def ==(other)
+        return self.eql?(other)
+    end
+    def hash
+        @node_ids[0].hash*7+@node_ids[1].hash
+    end
+    def to_s
+        "CrossLanguageRelation #{@node_ids[0]} <-> #{@node_ids[1]}"
+    end
+end
+# Calculates precision, recall, f-measure
+class FiguresEvaluator
+    # Gold set is the "truth", observed is calculated from
+    # some method and compared with the gold set
+    def initialize(gold_set,observed_set)
+        @gold_set     = gold_set
+        @observed_set = observed_set
+    end
+    def precision
+        @precision = calc_precision unless @precision
+        @precision
+    end
+    def recall
+        @recall = calc_recall unless @recall
+        @recall
+    end
+    def f_measure(beta=1.0)
+        beta_square = beta**2.0
+        (2*(beta_square)*precision*recall)/(beta_square*precision+recall)
+    end
+    def all_figures(beta=1.0)
+        {precision:precision,recall:recall,f_measure:f_measure(beta),beta:beta}
+    end
+    private
+    def calc_precision
+        intersection_size = @gold_set.intersection(@observed_set).count.to_f
+        intersection_size/@observed_set.count.to_f
+    end
+    def calc_recall
+        intersection_size = @gold_set.intersection(@observed_set).count.to_f
+        intersection_size/@gold_set.count.to_f
+    end
+end
+end

data/lib/crosslanguagespotter/jaccard.rb ADDED Viewed

@@ -0,0 +1,114 @@
+require "set"
+# Helpers to calculate the Jaccard Coefficient Index and related metrics easily.
+#
+# (from Wikipedia): The Jaccard coefficient measures similarity between sample sets, and is defined
+# as the size of the intersection divided by the size of the union of the sample sets.
+#
+# The closer to 1.0 this number is, the more similar two items are.
+module Jaccard
+  # Calculates the Jaccard Coefficient Index.
+  #
+  # +a+ must implement the set intersection and set union operators: <code>#&</code> and <code>#+</code>. Array and Set
+  # both implement these methods natively. It is expected that the results of <code>+</code> will either return a
+  # unique set or that it returns an object that responds to +#uniq!+. The results of +#coefficient+ will be
+  # wrong if the union contains duplicate elements.
+  #
+  # Also note that the individual items in +a+ and +b+ must implement a sane #eql? method.
+  # ActiveRecord::Base, String, Fixnum (but not Float), Array and Hash instances all implement
+  # a correct notion of equality. Other instances might have to be checked to ensure correct
+  # behavior.
+  #
+  # @param [#&, #+] a A set of items
+  # @param [#&, #+] b A second set of items
+  #
+  # @return [Float] The Jaccard Coefficient Index between +a+ and +b+.
+  #
+  # @example
+  #
+  #   a = [1, 2, 3, 4]
+  #   b = [1, 3, 4]
+  #   Jaccard.coefficient(a, b) #=> 0.75
+  #
+  # @see http://en.wikipedia.org/wiki/Jaccard_index Jaccard Coefficient Index on Wikipedia.
+  def self.coefficient(a, b)
+    raise ArgumentError, "#{a.inspect} does not implement #&" unless a.respond_to?(:&)
+    raise ArgumentError, "#{a.inspect} does not implement #+" unless a.respond_to?(:+)
+    intersection = a & b
+    union        = a + b
+    # Set does not implement #uniq or #uniq! since elements are
+    # always guaranteed to be present only once. That's the only
+    # reason we need to guard against that here.
+    union.uniq! if union.respond_to?(:uniq!)
+    intersection.length.to_f / union.length.to_f
+  end
+  # Calculates the Jaccard distance, i.e. the complement of the Jaccard coefficient.
+  #
+  # The closer to 0.0 the distance is, the more similar two items are.
+  #
+  # @return [Float] <code>1.0 - #coefficient(a, b)</code>
+  #
+  # @see Jaccard#coefficient for parameter calling convention and caveats about Array vs Set vs other object types.
+  def self.distance(a, b)
+    1.0 - coefficient(a, b)
+  end
+  # Determines which member of +others+ has the smallest distance vs +a+.
+  #
+  # Because of the implementation, if multiple items from +others+ have
+  # the same distance, the last one will be returned. If this is undesirable,
+  # reverse +others+ before calling #closest_to.
+  #
+  # @param [#&, #+] a A set of attributes
+  # @param [#inject] others A collection of set of attributes
+  #
+  # @return The item from +others+ with the distance minimized to 0.0.
+  #
+  # @example
+  #
+  #   a = [1, 2, 3]
+  #   b = [1, 3]
+  #   c = [1, 2, 3]
+  #   Jaccard.closest_to(b, [a, c]) #=> [1, 2, 3]
+  #   # Note that the actual instance returned will be c
+  def self.closest_to(a, others)
+    others.inject([2.0, nil]) do |memo, other|
+      dist = distance(a, other)
+      next memo if memo.first < dist
+      [dist, other]
+    end.last
+  end
+  # Returns the pair of items whose distance is minimized.
+  #
+  # @param [#each] items A collection of attributes.
+  #
+  # @return [Array<a, b>] A pair of set of attributes whose Jaccard distance is the minimal, given the input set.
+  #
+  # @example
+  #
+  #   a = [1, 2, 3]
+  #   b = [1, 2]
+  #   c = [1, 3]
+  #   Jaccard.best_match([a, b, c]) #=> [[1, 2, 3], [1, 2]]
+  def self.best_match(items)
+    seen = Set.new
+    matches = []
+    items.each do |row|
+      items.each do |col|
+        next if row == col
+        next if seen.include?([row, col]) || seen.include?([col, row])
+        seen << [row, col]
+        matches << [distance(row, col), [row, col]]
+      end
+    end
+    matches.sort.first.last
+  end
+end