company-mapping 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,27 +1,27 @@
1
# coding: utf-8
# Gem specification for company-mapping.
# Puts the gem's own lib/ directory on the load path so the version constant
# can be required before the gem is installed.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'company/mapping/version'

Gem::Specification.new do |spec|
  spec.name          = "company-mapping"
  spec.version       = Company::Mapping::VERSION
  spec.authors       = ["vasgat"]
  spec.email         = ["vasgat@gmail.com"]

  spec.summary       = "Maps new companies with those in a given corpus."
  spec.description   = "Given a Corpus of WikiRate Company Names and a new (incoming) Company Name, CompanyMapper class, finds the closest match if exists based on the calculated tf-idf similarity and a defined threshold."
  spec.homepage      = "https://github.com/vasgat/company-mapping"
  spec.license       = "MIT"

  # Package every git-tracked file except those under test/, spec/ or features/.
  tracked_files      = `git ls-files -z`.split("\x0")
  spec.files         = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }
  spec.bindir        = "exe"
  # Executables are the base names of the packaged files under exe/.
  spec.executables   = spec.files.grep(%r{^exe/}) { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.14"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.5"
end
@@ -1,18 +1,20 @@
1
- require_relative "mapping/version"
2
-
3
- module Company
4
- module Mapping
5
- end
6
- end
7
-
8
- require_relative 'mapping/document_utils/basic_tokenizer'
9
- require_relative 'mapping/document_utils/corpus'
10
- require_relative 'mapping/tfidf/tf/term_frequency'
11
- require 'securerandom'
12
- require_relative 'mapping/document_utils/text_document'
13
- require_relative 'mapping/tfidf/idf/inverse_document_frequency'
14
- require_relative 'mapping/tfidf/tf/normalized_term_frequency'
15
- require 'set'
16
- require_relative 'mapping/vector_similarity/cosine_similarity'
17
- require_relative 'mapping/tfidf/tfidf'
18
- require_relative 'mapping/company_mapper'
1
+ require_relative "mapping/version"
2
+
3
+ module Company
4
+ module Mapping
5
+ end
6
+ end
7
+
8
+ require 'csv'
9
+ require 'set'
10
+ require_relative 'mapping/document_utils/basic_tokenizer'
11
+ require_relative 'mapping/document_utils/corpus'
12
+ require_relative 'mapping/document_utils/company_corpus'
13
+ require_relative 'mapping/tfidf/tf/term_frequency'
14
+ require 'securerandom'
15
+ require_relative 'mapping/document_utils/text_document'
16
+ require_relative 'mapping/tfidf/idf/inverse_document_frequency'
17
+ require_relative 'mapping/tfidf/tf/normalized_term_frequency'
18
+ require_relative 'mapping/vector_similarity/cosine_similarity'
19
+ require_relative 'mapping/tfidf/tfidf'
20
+ require_relative 'mapping/company_mapper'
@@ -16,27 +16,19 @@ module Company
16
16
  def map(company, threshold)
17
17
  @tfidf.calculate_tfidf_weights_of_new_document(company)
18
18
 
19
- _maxSim = 0.0
20
- _mapped_company = ""
19
+ maxSim = 0.0
20
+ mapped_company = ""
21
21
  @corpus.each do |d|
22
- _similarity = @tfidf.similarity(d.id, company.id)
23
-
24
- if (_maxSim < _similarity)
25
- _maxSim = _similarity
26
- _mapped_company = d.id
27
- if (_maxSim == 1)
28
- break
29
- end
30
- end
22
+ similarity = @tfidf.similarity(d.id, company.id)
23
+ next unless maxSim < similarity
24
+ maxSim = similarity
25
+ mapped_company = d.id
26
+ break if maxSim == 1
31
27
  end
32
28
 
33
- if (_maxSim>threshold)
34
- return _mapped_company.to_s.sub(/\_.*/, "")
35
- else
36
- return nil
37
- end
29
+ return unless maxSim > threshold
30
+ mapped_company.to_s.sub(/\_.*/, "")
38
31
  end
39
32
  end
40
-
41
33
  end
42
- end
34
+ end
@@ -13,38 +13,33 @@ module Company
13
13
  end
14
14
 
15
15
  def tokenize(text)
16
- _text = tranform(text)
17
- _tokens = Array.new
18
- _index = 0;
19
- while (_index<_text.length)
20
- _char = String(_text[_index])
21
- if (_char.match(/\s/))
22
- _index = _index+1
23
- elsif (_char.match(/\w/))
24
- _buf = StringIO.new("")
25
- while ((_index < _text.length) && (_text[_index].match(/\w/)))
26
- _buf << _text[_index]
27
- _index += 1
16
+ text = tranform(text)
17
+ tokens = Array.new
18
+ index = 0
19
+ while (index < text.length)
20
+ char = text[index]
21
+ case char
22
+ when /\s/
23
+ index = index + 1
24
+ when /\w/ #/(?<word>\w+)/
25
+ buf = ""
26
+ while ((index < text.length) && (text[index].match(/\w/)))
27
+ buf << text[index]
28
+ index += 1
28
29
  end
29
- _tokens.push(_buf.string)
30
+ tokens.push buf
31
+ index += 1
30
32
  else
31
- if (!@doIgnorePunctuation)
32
- _buf = StringIO.new("")
33
- _buf << _char
34
- _tokens.push(_buf.string)
35
- end
36
- _index += 1
33
+ tokens.push(char) unless @doIgnorePunctuation
34
+ index += 1
37
35
  end
38
36
  end
39
- return _tokens
37
+ tokens
40
38
  end
41
39
 
42
40
  private
43
41
  def tranform(text)
44
- if (@doIgnoreCase)
45
- return text.to_s.downcase
46
- end
47
- return text.to_s
42
+ @doIgnoreCase ? text.to_s.downcase : text.to_s
48
43
  end
49
44
  end
50
45
  end
@@ -0,0 +1,32 @@
1
module Company
  module Mapping
    # A Corpus of company-name documents that can be filled from a CSV file.
    #
    # The first CSV field of every row is split on ";" and read as
    #   <id>;<name>;<alias>;<alias>...
    # The name is stored under <id>; the n-th alias (counting from 0) is
    # stored under "<id>_<n>".
    class CompanyCorpus < Corpus
      # path - optional path of a CSV file to import right away.
      def initialize(path = nil)
        super()
        import_csv(path) if path
      end

      # Adds one document per company name and alias found in the CSV file
      # at +path+.
      #
      # Rows that carry no id or no name (blank lines, for instance) are
      # skipped instead of raising NoMethodError on nil.
      #
      # NOTE(review): only the first CSV field of a row is used, so text
      # after an unquoted comma is ignored - confirm this matches the
      # expected file format.
      #
      # Returns the corpus itself (the former @corpus ivar is never set now
      # that Corpus is a Set, so the method used to return nil).
      def import_csv(path)
        CSV.foreach(path) do |row|
          id, name, *aliases = row.first.to_s.split(";")
          next if id.nil? || name.nil?

          push doc(name, id)
          aliases.each_with_index do |company_alias, i|
            push doc(company_alias, "#{id}_#{i}")
          end
        end
        self
      end

      private

      # Builds a TextDocument with the given id whose contents are
      # +content+ with all commas and full stops removed.
      def doc(content, id)
        TextDocument.new.tap do |document|
          document.contents = content.delete(",.")
          document.id = id
        end
      end
    end
  end
end
@@ -1,26 +1,7 @@
1
1
  module Company
2
-
3
2
  module Mapping
4
-
5
- class Corpus
6
-
7
- def initialize
8
- @corpus = Set.new
9
- end
10
-
11
- def push(document)
12
- @corpus.add(document)
13
- end
14
-
15
- def size
16
- return @corpus.size
17
- end
18
-
19
- def each
20
- @corpus.each do |doc|
21
- yield(doc)
22
- end
23
- end
3
+ class Corpus < Set
4
+ alias_method :push, :add
24
5
  end
25
6
  end
26
- end
7
+ end
@@ -1,6 +1,6 @@
1
1
  module Company
2
2
  module Mapping
3
-
3
+ # A simple text document
4
4
  class TextDocument
5
5
  attr_accessor :id, :contents, :tokenizer
6
6
 
@@ -22,10 +22,6 @@ module Company
22
22
  o.class == self.class && o.state == self.state
23
23
  end
24
24
 
25
- def info
26
- return "A simple text document"
27
- end
28
-
29
25
  def to_s
30
26
  "TextDocument:{#{id},#{contents}}"
31
27
  end
@@ -35,6 +31,5 @@ module Company
35
31
  [@id]
36
32
  end
37
33
  end
38
-
39
34
  end
40
35
  end
@@ -4,7 +4,6 @@ module Company
4
4
  #InverseDocumentFrequency is the basic implementation of inverse document frequency. It is the logarithmically
5
5
  #scaled inverse fraction of the documents that contain the token, obtained by dividing the total number of documents by
6
6
  #the number of documents containing the token, and then taking the logarithm of that quotient.
7
-
8
7
  class InverseDocumentFrequency
9
8
 
10
9
  def initialize(corpus)
@@ -13,50 +12,25 @@ module Company
13
12
 
14
13
  #Calculates the basic Inverse Document Frequency of each token contained in a corpus of documents.
15
14
  def calculate
16
- _df = document_frequency
17
- _idf = Hash.new
18
-
19
-
20
- _df.each do |word, freq|
21
- _idf[word] = Math.log(@corpus.size/freq)
15
+ document_frequency.each_with_object({}) do |(word, freq), idf|
16
+ idf[word] = Math.log(@corpus.size/freq)
22
17
  end
23
- return _idf
24
- end
25
-
26
- def info
27
- "The inverse document frequency is a measure of how much "
28
- +"information the word provides, that is, whether the term is "
29
- +"common or rare across all documents of a corpus. It is the logarithmically "
30
- +"scaled inverse fraction of the documents that contain the token,"
31
- +" obtained by dividing the total number of documents by the number "
32
- +"of documents containing the token, and then taking the logarithm "
33
- +"of that quotient."
34
18
  end
35
19
 
36
20
  def maxIDF
37
- return Math.log(@corpus.size * 1.0)
21
+ Math.log(@corpus.size * 1.0)
38
22
  end
39
23
 
40
24
  protected
41
25
 
42
26
  #calculates the number of document occurrences of unique tokens within a corpus
43
27
  def document_frequency
44
- _df = Hash.new
45
-
46
- @corpus.each do |doc|
47
- _words = doc.bag_of_words.keys
48
-
49
- _words.each do |word|
50
- if (_df.has_key?(word))
51
- _df[word] = _df[word]+1.0
52
- else
53
- _df[word] = 1.0
54
- end
28
+ @corpus.each_with_object({}) do |doc, df|
29
+ doc.bag_of_words.keys.each do |word|
30
+ df[word] = (df.fetch(word) { 0.0 }) + 1.0
55
31
  end
56
32
  end
57
- return _df
58
33
  end
59
34
  end
60
-
61
35
  end
62
36
  end
@@ -6,22 +6,11 @@ module Company
6
6
  # f(t,d) is zero.
7
7
 
8
8
  class NormalizedTermFrequency < TermFrequency
9
-
10
9
  def calculate(text)
11
- _rawTF = rawFrequency(text)
12
- _logTF = Hash.new
13
-
14
- _rawTF.each do |key, value|
15
- _logTF[key] = 1.0 + Math.log(value)
10
+ rawFrequency(text).each_with_object({}) do |(key, value), logTF|
11
+ logTF[key] = 1.0 + Math.log(value)
16
12
  end
17
-
18
- return _logTF
19
- end
20
-
21
- def info
22
- return "Logarithmically scaled term frequency: tf(t,d) = 1 + log(f(t,d)), or zero if ft,d is zero";
23
13
  end
24
14
  end
25
-
26
15
  end
27
16
  end
@@ -1,6 +1,6 @@
1
1
  module Company
2
2
  module Mapping
3
-
3
+ # Raw term frequency (number of times a token appears in a given string - document)
4
4
  class TermFrequency
5
5
 
6
6
  def initialize(tokenizer)
@@ -9,29 +9,16 @@ module Company
9
9
 
10
10
  #Calculates the raw term frequency given the contents of the document.
11
11
  def calculate(text)
12
- return rawFrequency(text)
13
- end
14
-
15
- def info
16
- return "Raw term frequency (number of times a token appears in a given string - document)"
12
+ rawFrequency(text)
17
13
  end
18
14
 
19
15
  protected
20
16
  def rawFrequency(contents)
21
- _tokens = @tokenizer.tokenize(contents)
22
- _tf = Hash.new
23
-
24
- _tokens.each {
25
- |_token|
26
- if (!_tf.has_key?(_token))
27
- _tf[_token] = 1
28
- else
29
- _tf[_token] = _tf[_token] + 1
30
- end
31
- }
32
- return _tf
17
+ @tokenizer.tokenize(contents).each_with_object({}) do |token, tf|
18
+ tf[token] ||= 0
19
+ tf[token] += 1
20
+ end
33
21
  end
34
22
  end
35
-
36
23
  end
37
24
  end
@@ -1,7 +1,8 @@
1
1
  module Company
2
2
  module Mapping
3
3
 
4
- #TFIDF class implements Term Frequency Inverse Document Frequency statistic.
4
+ #TFIDF class implements Term Frequency Inverse Document Frequency statistic. Term frequency–inverse document frequency,
5
+ # is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.
5
6
  class TFIDF
6
7
  attr_accessor :tf, :idf
7
8
 
@@ -13,67 +14,38 @@ module Company
13
14
  def calculate
14
15
  @tfidf = Hash.new
15
16
 
16
- if (@idf==nil)
17
- @idf = InverseDocumentFrequency.new(@corpus)
18
- end
19
-
20
- if (@tf==nil)
21
- _tokenizer = BasicTokenizer.new
22
- @tf = NormalizedTermFrequency.new(_tokenizer)
23
- end
24
-
17
+ @idf ||= InverseDocumentFrequency.new(@corpus)
18
+ @tf ||= NormalizedTermFrequency.new(BasicTokenizer.new)
25
19
  @idf_weights = @idf.calculate
26
20
 
27
- @corpus.each {
28
- |doc|
29
-
30
- _termfreq = @tf.calculate(doc.contents)
31
-
32
- _tfidf_weights = Hash.new
33
-
34
- _termfreq.each do |term, tf|
35
- _weight = tf * @idf_weights[term]
36
- _tfidf_weights[term] = _weight
37
- end
21
+ @corpus.each do |doc|
22
+ termfreq = @tf.calculate(doc.contents)
38
23
 
39
- @tfidf[doc.id] = _tfidf_weights
40
- }
41
- return @tfidf
24
+ @tfidf[doc.id] =
25
+ termfreq.each_with_object({}) do |(term, tf), tfidf_weights|
26
+ weight = tf * @idf_weights[term]
27
+ tfidf_weights[term] = weight
28
+ end
29
+ end
30
+ @tfidf
42
31
  end
43
32
 
44
33
  #Calculates tfidf weights of new incoming document without importing the document in the corpus and re-calculating the tf-idf weights for the entire corpus
45
34
  def calculate_tfidf_weights_of_new_document(new_doc)
46
- _termfreq = @tf.calculate(new_doc.contents)
47
-
48
- _tfidf_weights = Hash.new
35
+ termfreq = @tf.calculate(new_doc.contents)
49
36
 
50
- _termfreq.each do |term, tf|
51
- if (@idf_weights.has_key? term)
52
- _weight = tf * @idf_weights[term]
53
- else
54
- _weight = tf * @idf.maxIDF
55
- end
56
- _tfidf_weights[term] = _weight
37
+ @tfidf[new_doc.id] = termfreq.each_with_object({}) do |(term, tf), tfidf_weights|
38
+ weight = tf * (@idf_weights[term] || @idf.maxIDF)
39
+ tfidf_weights[term] = weight
57
40
  end
58
- @tfidf[new_doc.id] = _tfidf_weights
59
- return @tfidf
41
+ @tfidf
60
42
  end
61
43
 
62
44
  #Calculates tf-idf similarity between two given documents. It is actually
63
45
  #the calculated Cosine Similarity by using tf*idf weights.
64
46
  def similarity(doc1_id, doc2_id)
65
- if (@tfidf==nil)
66
- calculate
67
- end
68
-
69
- _cosine_similarity = CosineSimilarity.new
70
- return _cosine_similarity.calculate(@tfidf[doc1_id], @tfidf[doc2_id])
71
- end
72
-
73
- def info
74
- " term frequency–inverse document frequency, is a numerical "
75
- +"statistic that is intended to reflect how important a word "
76
- +"is to a document in a collection or corpus"
47
+ @tfidf ||= calculate
48
+ CosineSimilarity.new.calculate(@tfidf[doc1_id], @tfidf[doc2_id])
77
49
  end
78
50
  end
79
51