RubyGems - company-mapping - Versions diffs - 0.1.0 → 0.2.0 - Mend

company-mapping 0.1.0 → 0.2.0

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (17)

checksums.yaml +4 -4
data/.idea/company-mapping.iml +35 -1
data/.idea/workspace.xml +363 -105
data/company-mapping.gemspec +27 -27
data/lib/company/mapping.rb +20 -18
data/lib/company/mapping/company_mapper.rb +10 -18
data/lib/company/mapping/document_utils/basic_tokenizer.rb +19 -24
data/lib/company/mapping/document_utils/company_corpus.rb +32 -0
data/lib/company/mapping/document_utils/corpus.rb +3 -22
data/lib/company/mapping/document_utils/text_document.rb +1 -6
data/lib/company/mapping/tfidf/idf/inverse_document_frequency.rb +6 -32
data/lib/company/mapping/tfidf/tf/normalized_term_frequency.rb +2 -13
data/lib/company/mapping/tfidf/tf/term_frequency.rb +6 -19
data/lib/company/mapping/tfidf/tfidf.rb +20 -48
data/lib/company/mapping/vector_similarity/cosine_similarity.rb +9 -21
data/lib/company/mapping/version.rb +1 -1
metadata +5 -4

data/company-mapping.gemspec CHANGED Viewed

@@ -1,27 +1,27 @@
-# coding: utf-8
-lib = File.expand_path('../lib', __FILE__)
-$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
-require 'company/mapping/version'
-Gem::Specification.new do |spec|
-  spec.name          = "company-mapping"
-  spec.version       = Company::Mapping::VERSION
-  spec.authors       = ["vasgat"]
-  spec.email         = ["vasgat@gmail.com"]
-  spec.summary       = %q{Maps new companies with those in a given corpus.}
-  spec.description   = %q{Given a Corpus of WikiRate Company Names and a new (incoming) Company Name, CompanyMapper class, finds the closest match if exists based on the calculated tf-idf similarity and a defined threshold.}
-  spec.homepage      = %q{https://github.com/vasgat/company-mapping}
-  spec.license       = "MIT"
-  spec.files         = `git ls-files -z`.split("\x0").reject do |f|
-    f.match(%r{^(test|spec|features)/})
-  end
-  spec.bindir        = "exe"
-  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
-  spec.require_paths = ["lib"]
-  spec.add_development_dependency "bundler", "~> 1.14"
-  spec.add_development_dependency "rake", "~> 10.0"
-  spec.add_development_dependency "minitest", "~> 5.0"
-end
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'company/mapping/version'
+Gem::Specification.new do |spec|
+  spec.name          = "company-mapping"
+  spec.version       = Company::Mapping::VERSION
+  spec.authors       = ["vasgat"]
+  spec.email         = ["vasgat@gmail.com"]
+  spec.summary       = %q{Maps new companies with those in a given corpus.}
+  spec.description   = %q{Given a Corpus of WikiRate Company Names and a new (incoming) Company Name, CompanyMapper class, finds the closest match if exists based on the calculated tf-idf similarity and a defined threshold.}
+  spec.homepage      = %q{https://github.com/vasgat/company-mapping}
+  spec.license       = "MIT"
+  spec.files         = `git ls-files -z`.split("\x0").reject do |f|
+    f.match(%r{^(test|spec|features)/})
+  end
+  spec.bindir        = "exe"
+  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ["lib"]
+  spec.add_development_dependency "bundler", "~> 1.14"
+  spec.add_development_dependency "rake", "~> 10.0"
+  spec.add_development_dependency "rspec", "~> 3.5"
+end

data/lib/company/mapping.rb CHANGED Viewed

@@ -1,18 +1,20 @@
-require_relative "mapping/version"
-module Company
-  module Mapping
-  end
-end
-require_relative 'mapping/document_utils/basic_tokenizer'
-require_relative 'mapping/document_utils/corpus'
-require_relative 'mapping/tfidf/tf/term_frequency'
-require 'securerandom'
-require_relative 'mapping/document_utils/text_document'
-require_relative 'mapping/tfidf/idf/inverse_document_frequency'
-require_relative 'mapping/tfidf/tf/normalized_term_frequency'
-require 'set'
-require_relative 'mapping/vector_similarity/cosine_similarity'
-require_relative 'mapping/tfidf/tfidf'
-require_relative 'mapping/company_mapper'
+require_relative "mapping/version"
+module Company
+  module Mapping
+  end
+end
+require 'csv'
+require 'set'
+require_relative 'mapping/document_utils/basic_tokenizer'
+require_relative 'mapping/document_utils/corpus'
+require_relative 'mapping/document_utils/company_corpus'
+require_relative 'mapping/tfidf/tf/term_frequency'
+require 'securerandom'
+require_relative 'mapping/document_utils/text_document'
+require_relative 'mapping/tfidf/idf/inverse_document_frequency'
+require_relative 'mapping/tfidf/tf/normalized_term_frequency'
+require_relative 'mapping/vector_similarity/cosine_similarity'
+require_relative 'mapping/tfidf/tfidf'
+require_relative 'mapping/company_mapper'

data/lib/company/mapping/company_mapper.rb CHANGED Viewed

@@ -16,27 +16,19 @@ module Company
       def map(company, threshold)
         @tfidf.calculate_tfidf_weights_of_new_document(company)
-        _maxSim = 0.0
-        _mapped_company = ""
+        maxSim = 0.0
+        mapped_company = ""
         @corpus.each do |d|
-          _similarity = @tfidf.similarity(d.id, company.id)
-          if (_maxSim < _similarity)
-            _maxSim = _similarity
-            _mapped_company = d.id
-            if (_maxSim == 1)
-              break
-            end
-          end
+          similarity = @tfidf.similarity(d.id, company.id)
+          next unless maxSim < similarity
+          maxSim = similarity
+          mapped_company = d.id
+          break if maxSim == 1
         end
-        if (_maxSim>threshold)
-          return _mapped_company.to_s.sub(/\_.*/, "")
-        else
-          return nil
-        end
+        return unless maxSim > threshold
+        mapped_company.to_s.sub(/\_.*/, "")
       end
     end
   end
-end
+end

data/lib/company/mapping/document_utils/basic_tokenizer.rb CHANGED Viewed

@@ -13,38 +13,33 @@ module Company
       end
       def tokenize(text)
-        _text = tranform(text)
-        _tokens = Array.new
-        _index = 0;
-        while (_index<_text.length)
-          _char = String(_text[_index])
-          if (_char.match(/\s/))
-            _index = _index+1
-          elsif (_char.match(/\w/))
-            _buf = StringIO.new("")
-            while ((_index < _text.length) && (_text[_index].match(/\w/)))
-              _buf << _text[_index]
-              _index += 1
+        text = tranform(text)
+        tokens = Array.new
+        index = 0
+        while (index < text.length)
+          char = text[index]
+          case char
+          when /\s/
+            index = index + 1
+          when /\w/ #/(?<word>\w+)/
+            buf = ""
+            while ((index < text.length) && (text[index].match(/\w/)))
+              buf << text[index]
+              index += 1
             end
-            _tokens.push(_buf.string)
+            tokens.push buf
+            index += 1
           else
-            if (!@doIgnorePunctuation)
-              _buf = StringIO.new("")
-              _buf << _char
-              _tokens.push(_buf.string)
-            end
-            _index += 1
+            tokens.push(char) unless @doIgnorePunctuation
+            index += 1
           end
         end
-        return _tokens
+        tokens
       end
       private
       def tranform(text)
-        if (@doIgnoreCase)
-          return text.to_s.downcase
-        end
-        return text.to_s
+        @doIgnoreCase ? text.to_s.downcase : text.to_s
       end
     end
   end

data/lib/company/mapping/document_utils/company_corpus.rb ADDED Viewed

@@ -0,0 +1,32 @@
+module Company
+  module Mapping
+    class CompanyCorpus < Corpus
+      def initialize(path=nil)
+        super()
+        import_csv path if path
+      end
+      # build a corpus from a csv file
+      def import_csv path
+        CSV.foreach(path) do |row|
+          array = row.first.split(";")
+          push doc(array[1], array.first)
+          array[2..-1].each_with_index do |company_alias, i|
+            push doc(company_alias, "#{array.first}_#{i}")
+          end
+        end
+        @corpus
+      end
+      private
+      def doc content, id
+        alias_doc = TextDocument.new
+        alias_doc.contents = content.gsub(",", "").gsub(".", "")
+        alias_doc.id = id
+        alias_doc
+      end
+    end
+  end
+end

data/lib/company/mapping/document_utils/corpus.rb CHANGED Viewed

@@ -1,26 +1,7 @@
 module Company
   module Mapping
-    class Corpus
-      def initialize
-        @corpus = Set.new
-      end
-      def push(document)
-        @corpus.add(document)
-      end
-      def size
-        return @corpus.size
-      end
-      def each
-        @corpus.each do |doc|
-          yield(doc)
-        end
-      end
+    class Corpus < Set
+      alias_method :push, :add
     end
   end
-end
+end

data/lib/company/mapping/document_utils/text_document.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Company
   module Mapping
+    # A simple text document
     class TextDocument
       attr_accessor :id, :contents, :tokenizer
@@ -22,10 +22,6 @@ module Company
         o.class == self.class && o.state == self.state
       end
-      def info
-        return "A simple text document"
-      end
       def to_s
         "TextDocument:{#{id},#{contents}}"
       end
@@ -35,6 +31,5 @@ module Company
         [@id]
       end
     end
   end
 end

data/lib/company/mapping/tfidf/idf/inverse_document_frequency.rb CHANGED Viewed

@@ -4,7 +4,6 @@ module Company
 #InverseDocumentFrequency consists the basic implementation of inverse document frequency. It is the logarithmically
 #scaled inverse fraction of the documents that contain the token, obtained by dividing the total number of documents by
 #the number of documents containing the token, and then taking the logarithm of that quotient.
     class InverseDocumentFrequency
       def initialize(corpus)
@@ -13,50 +12,25 @@ module Company
       #Calculates the basic Inverse Document Frequency of each token contained in a corpus of documents.
       def calculate
-        _df = document_frequency
-        _idf = Hash.new
-        _df.each do |word, freq|
-          _idf[word] = Math.log(@corpus.size/freq)
+        document_frequency.each_with_object({}) do |(word, freq), idf|
+          idf[word] = Math.log(@corpus.size/freq)
         end
-        return _idf
-      end
-      def info
-        "The inverse document frequency is a measure of how much "
-        +"information the word provides, that is, whether the term is "
-        +"common or rare across all documents of a corpus. It is the logarithmically "
-        +"scaled inverse fraction of the documents that contain the token,"
-        +" obtained by dividing the total number of documents by the number "
-        +"of documents containing the token, and then taking the logarithm "
-        +"of that quotient."
       end
       def maxIDF
-        return Math.log(@corpus.size * 1.0)
+        Math.log(@corpus.size * 1.0)
       end
       protected
       #calculates the number of document occurrences of unique tokens within a corpus
       def document_frequency
-        _df = Hash.new
-        @corpus.each do |doc|
-          _words = doc.bag_of_words.keys
-          _words.each do |word|
-            if (_df.has_key?(word))
-              _df[word] = _df[word]+1.0
-            else
-              _df[word] = 1.0
-            end
+        @corpus.each_with_object({}) do |doc, df|
+          doc.bag_of_words.keys.each do |word|
+            df[word] = (df.fetch(word) { 0.0 }) + 1.0
           end
         end
-        return _df
       end
     end
   end
 end

data/lib/company/mapping/tfidf/tf/normalized_term_frequency.rb CHANGED Viewed

@@ -6,22 +6,11 @@ module Company
 # f(t,d) is zero.
     class NormalizedTermFrequency < TermFrequency
       def calculate(text)
-        _rawTF = rawFrequency(text)
-        _logTF = Hash.new
-        _rawTF.each do |key, value|
-          _logTF[key] = 1.0 + Math.log(value)
+        rawFrequency(text).each_with_object({}) do |(key, value), logTF|
+          logTF[key] = 1.0 + Math.log(value)
         end
-        return _logTF
-      end
-      def info
-        return "Logarithmically scaled term frequency: tf(t,d) = 1 + log(f(t,d)), or zero if ft,d is zero";
       end
     end
   end
 end

data/lib/company/mapping/tfidf/tf/term_frequency.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Company
   module Mapping
+    # Raw term frequency (number of times a token appears in a given string - document)
     class TermFrequency
       def initialize(tokenizer)
@@ -9,29 +9,16 @@ module Company
       #Calculates the raw term frequency given the contents of the document.
       def calculate(text)
-        return rawFrequency(text)
-      end
-      def info
-        return "Raw term frequency (number of times a token appears in a given string - document)"
+        rawFrequency(text)
       end
       protected
       def rawFrequency(contents)
-        _tokens = @tokenizer.tokenize(contents)
-        _tf = Hash.new
-        _tokens.each {
-            |_token|
-          if (!_tf.has_key?(_token))
-            _tf[_token] = 1
-          else
-            _tf[_token] = _tf[_token] + 1
-          end
-        }
-        return _tf
+        @tokenizer.tokenize(contents).each_with_object({}) do |token, tf|
+          tf[token] ||= 0
+          tf[token] += 1
+        end
       end
     end
   end
 end

data/lib/company/mapping/tfidf/tfidf.rb CHANGED Viewed

@@ -1,7 +1,8 @@
 module Company
   module Mapping
-#TFIDF class implements Term Frequency Inverse Document Frequency statistic.
+#TFIDF class implements Term Frequency Inverse Document Frequency statistic. Term frequency–inverse document frequency,
+# is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.
     class TFIDF
       attr_accessor :tf, :idf
@@ -13,67 +14,38 @@ module Company
       def calculate
         @tfidf = Hash.new
-        if (@idf==nil)
-          @idf = InverseDocumentFrequency.new(@corpus)
-        end
-        if (@tf==nil)
-          _tokenizer = BasicTokenizer.new
-          @tf = NormalizedTermFrequency.new(_tokenizer)
-        end
+        @idf ||= InverseDocumentFrequency.new(@corpus)
+        @tf ||= NormalizedTermFrequency.new(BasicTokenizer.new)
         @idf_weights = @idf.calculate
-        @corpus.each {
-            |doc|
-          _termfreq = @tf.calculate(doc.contents)
-          _tfidf_weights = Hash.new
-          _termfreq.each do |term, tf|
-            _weight = tf * @idf_weights[term]
-            _tfidf_weights[term] = _weight
-          end
+        @corpus.each do |doc|
+          termfreq = @tf.calculate(doc.contents)
-          @tfidf[doc.id] = _tfidf_weights
-        }
-        return @tfidf
+          @tfidf[doc.id] =
+              termfreq.each_with_object({}) do |(term, tf), tfidf_weights|
+                weight = tf * @idf_weights[term]
+                tfidf_weights[term] = weight
+              end
+        end
+        @tfidf
       end
       #Calculates tfidf weights of new incoming document without importing the document in the corpus and re-calculating the tf-idf weights for the entire corpus
       def calculate_tfidf_weights_of_new_document(new_doc)
-        _termfreq = @tf.calculate(new_doc.contents)
-        _tfidf_weights = Hash.new
+        termfreq = @tf.calculate(new_doc.contents)
-        _termfreq.each do |term, tf|
-          if (@idf_weights.has_key? term)
-            _weight = tf * @idf_weights[term]
-          else
-            _weight = tf * @idf.maxIDF
-          end
-          _tfidf_weights[term] = _weight
+        @tfidf[new_doc.id] = termfreq.each_with_object({}) do |(term, tf), tfidf_weights|
+          weight = tf * (@idf_weights[term] || @idf.maxIDF)
+          tfidf_weights[term] = weight
         end
-        @tfidf[new_doc.id] = _tfidf_weights
-        return @tfidf
+        @tfidf
       end
       #Calculates tf-idf similarity between two given documents. It is actually
       #the calculated Cosine Similarity by using tf*idf weights.
       def similarity(doc1_id, doc2_id)
-        if (@tfidf==nil)
-          calculate
-        end
-        _cosine_similarity = CosineSimilarity.new
-        return _cosine_similarity.calculate(@tfidf[doc1_id], @tfidf[doc2_id])
-      end
-      def info
-        " term frequency–inverse document frequency, is a numerical "
-        +"statistic that is intended to reflect how important a word "
-        +"is to a document in a collection or corpus"
+        @tfidf ||= calculate
+        CosineSimilarity.new.calculate(@tfidf[doc1_id], @tfidf[doc2_id])
       end
     end