company-mapping 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
module Company
  module Mapping

    # Term-frequency strategy layered on top of TermFrequency. Each raw count
    # f(t,d) is rescaled logarithmically: tf(t,d) = 1 + log(f(t,d)).
    # Only tokens present in the raw counts receive an entry in the result.
    class NormalizedTermFrequency < TermFrequency

      # Returns a Hash mapping every token found in +text+ to 1.0 plus the
      # natural logarithm of its raw occurrence count.
      def calculate(text)
        rawFrequency(text).each_with_object({}) do |(token, count), scaled|
          scaled[token] = 1.0 + Math.log(count)
        end
      end

      # Human-readable description of this weighting scheme.
      def info
        "Logarithmically scaled term frequency: tf(t,d) = 1 + log(f(t,d)), or zero if ft,d is zero"
      end
    end

  end
end
@@ -0,0 +1,37 @@
module Company
  module Mapping

    # Raw term-frequency strategy: counts how many times each token produced
    # by the injected tokenizer occurs in a piece of text.
    class TermFrequency

      # tokenizer - object responding to +tokenize(text)+ and returning an
      #             enumerable collection of tokens.
      def initialize(tokenizer)
        @tokenizer = tokenizer
      end

      # Calculates the raw term frequency given the contents of the document.
      #
      # Returns a Hash mapping each token to the number of times it occurs.
      def calculate(text)
        rawFrequency(text)
      end

      # Human-readable description of this weighting scheme.
      def info
        "Raw term frequency (number of times a token appears in a given string - document)"
      end

      protected

      # Tokenizes +contents+ and tallies the resulting tokens.
      #
      # A plain Hash (no default value) is returned on purpose, so looking up
      # a token that never occurred still yields nil rather than 0.
      def rawFrequency(contents)
        @tokenizer.tokenize(contents).each_with_object({}) do |token, counts|
          counts[token] = counts.fetch(token, 0) + 1
        end
      end
    end

  end
end
@@ -0,0 +1,81 @@
module Company
  module Mapping

    # TFIDF class implements the Term Frequency - Inverse Document Frequency
    # statistic.
    #
    # The term-frequency and inverse-document-frequency strategies may be
    # injected through the +tf+ and +idf+ accessors. When they are left unset,
    # +calculate+ falls back to NormalizedTermFrequency (fed by a
    # BasicTokenizer) and to InverseDocumentFrequency built from the corpus.
    class TFIDF
      attr_accessor :tf, :idf

      # corpus - collection iterated with +each+; every yielded document must
      #          respond to +id+ and +contents+.
      def initialize(corpus)
        @corpus = corpus
      end

      # Calculates the tf-idf weights in the given corpus.
      #
      # The result is rebuilt from scratch on every call and kept in @tfidf.
      #
      # Returns a Hash of the form { doc_id => { term => weight } }.
      def calculate
        @tfidf = {}
        @idf ||= InverseDocumentFrequency.new(@corpus)
        @tf ||= NormalizedTermFrequency.new(BasicTokenizer.new)
        @idf_weights = @idf.calculate

        @corpus.each do |doc|
          weights = {}
          @tf.calculate(doc.contents).each do |term, frequency|
            # NOTE(review): assumes the idf weights cover every term the tf
            # strategy emits for corpus documents - confirm both strategies
            # tokenize the same way.
            weights[term] = frequency * @idf_weights[term]
          end
          @tfidf[doc.id] = weights
        end

        @tfidf
      end

      # Calculates tfidf weights of a new incoming document without importing
      # the document in the corpus and re-calculating the tf-idf weights for
      # the entire corpus. Terms that have no idf weight are weighted with
      # +@idf.maxIDF+.
      #
      # Returns the full { doc_id => { term => weight } } Hash, now including
      # an entry for +new_doc.id+.
      def calculate_tfidf_weights_of_new_document(new_doc)
        # The corpus weights (and the default tf/idf strategies) only exist
        # after #calculate has run; compute them on demand instead of failing
        # on nil.
        calculate if @tfidf.nil?

        weights = {}
        @tf.calculate(new_doc.contents).each do |term, frequency|
          weights[term] = frequency * @idf_weights.fetch(term) { @idf.maxIDF }
        end

        @tfidf[new_doc.id] = weights
        @tfidf
      end

      # Calculates tf-idf similarity between two given documents. It is
      # actually the Cosine Similarity computed on the tf*idf weight vectors
      # stored under the two ids.
      def similarity(doc1_id, doc2_id)
        calculate if @tfidf.nil?

        CosineSimilarity.new.calculate(@tfidf[doc1_id], @tfidf[doc2_id])
      end

      # Human-readable description of the statistic.
      #
      # The fragments are joined with trailing backslashes: starting a
      # continuation line with "+" makes Ruby treat it as a separate statement,
      # which would return the last fragment only.
      def info
        " term frequency–inverse document frequency, is a numerical " \
          "statistic that is intended to reflect how important a word " \
          "is to a document in a collection or corpus"
      end
    end

  end
end
@@ -0,0 +1,47 @@
require 'set'

module Company
  module Mapping

    # Implements Cosine Similarity between two vectors: it measures the cosine
    # of the angle between them.
    class CosineSimilarity

      # Calculates cosine similarity between two documents. The documents are
      # expressed as vectors of tokens (bag of words model).
      #
      # doc1, doc2 - Hashes mapping token => numeric weight.
      #
      # Returns the similarity as a Float rounded to 4 decimal places, or 0.0
      # when either vector has zero magnitude (empty or all-zero weights),
      # where the cosine is undefined and the division would yield NaN.
      def calculate(doc1, doc2)
        norm_product = Math.sqrt(d(doc1)) * Math.sqrt(d(doc2))
        return 0.0 if norm_product.zero?

        (dotProduct(doc1, doc2) / norm_product).round(4)
      end

      # Human-readable description of the measure.
      #
      # The fragments are joined with a trailing backslash: starting the
      # continuation line with "+" makes Ruby treat it as a separate statement,
      # which would return the second fragment only.
      def info
        "Implements Cosine Similarity between two non zero vectors and it measures the cosine of the angle between them.\n" \
          "More info about Cosine Similarity can be found: https://en.wikipedia.org/wiki/Cosine_similarity"
      end

      protected

      # Calculates the dot product between the two document vectors. Only
      # tokens present in both documents contribute; every other token would
      # be multiplied by an implicit zero weight.
      def dotProduct(doc1, doc2)
        findCommonTokens(doc1.keys, doc2.keys).inject(0.0) do |sum, token|
          sum + doc2[token] * doc1[token]
        end
      end

      # Returns the SQUARED magnitude of a document vector (the sum of its
      # squared weights); callers take the square root themselves.
      def d(doc)
        doc.each_value.inject(0.0) { |sum, weight| sum + weight**2.0 }
      end

      # Returns the Set of tokens common to both token lists.
      def findCommonTokens(doc1_tokens, doc2_tokens)
        Set.new(doc1_tokens).intersection(Set.new(doc2_tokens))
      end
    end

  end
end
@@ -0,0 +1,5 @@
module Company
  module Mapping
    # Version string of the company-mapping gem. Frozen so the shared
    # constant cannot be mutated in place at runtime.
    VERSION = "0.1.0".freeze
  end
end
metadata ADDED
@@ -0,0 +1,113 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: company-mapping
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - vasgat
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2017-03-21 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.14'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.14'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: minitest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '5.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '5.0'
55
+ description: Given a Corpus of WikiRate Company Names and a new (incoming) Company
56
+ Name, CompanyMapper class, finds the closest match if exists based on the calculated
57
+ tf-idf similarity and a defined threshold.
58
+ email:
59
+ - vasgat@gmail.com
60
+ executables: []
61
+ extensions: []
62
+ extra_rdoc_files: []
63
+ files:
64
+ - ".gitignore"
65
+ - ".idea/company-mapping.iml"
66
+ - ".idea/misc.xml"
67
+ - ".idea/modules.xml"
68
+ - ".idea/workspace.xml"
69
+ - ".travis.yml"
70
+ - CODE_OF_CONDUCT.md
71
+ - Gemfile
72
+ - LICENSE.txt
73
+ - README.md
74
+ - Rakefile
75
+ - bin/console
76
+ - bin/setup
77
+ - company-mapping.gemspec
78
+ - lib/company/mapping.rb
79
+ - lib/company/mapping/company_mapper.rb
80
+ - lib/company/mapping/document_utils/basic_tokenizer.rb
81
+ - lib/company/mapping/document_utils/corpus.rb
82
+ - lib/company/mapping/document_utils/text_document.rb
83
+ - lib/company/mapping/tfidf/idf/inverse_document_frequency.rb
84
+ - lib/company/mapping/tfidf/tf/normalized_term_frequency.rb
85
+ - lib/company/mapping/tfidf/tf/term_frequency.rb
86
+ - lib/company/mapping/tfidf/tfidf.rb
87
+ - lib/company/mapping/vector_similarity/cosine_similarity.rb
88
+ - lib/company/mapping/version.rb
89
+ homepage: https://github.com/vasgat/company-mapping
90
+ licenses:
91
+ - MIT
92
+ metadata: {}
93
+ post_install_message:
94
+ rdoc_options: []
95
+ require_paths:
96
+ - lib
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ requirements: []
108
+ rubyforge_project:
109
+ rubygems_version: 2.6.11
110
+ signing_key:
111
+ specification_version: 4
112
+ summary: Maps new companies with those in a given corpus.
113
+ test_files: []