company-mapping 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.idea/company-mapping.iml +28 -0
- data/.idea/misc.xml +4 -0
- data/.idea/modules.xml +8 -0
- data/.idea/workspace.xml +777 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +41 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/company-mapping.gemspec +27 -0
- data/lib/company/mapping.rb +18 -0
- data/lib/company/mapping/company_mapper.rb +42 -0
- data/lib/company/mapping/document_utils/basic_tokenizer.rb +51 -0
- data/lib/company/mapping/document_utils/corpus.rb +26 -0
- data/lib/company/mapping/document_utils/text_document.rb +40 -0
- data/lib/company/mapping/tfidf/idf/inverse_document_frequency.rb +62 -0
- data/lib/company/mapping/tfidf/tf/normalized_term_frequency.rb +27 -0
- data/lib/company/mapping/tfidf/tf/term_frequency.rb +37 -0
- data/lib/company/mapping/tfidf/tfidf.rb +81 -0
- data/lib/company/mapping/vector_similarity/cosine_similarity.rb +47 -0
- data/lib/company/mapping/version.rb +5 -0
- metadata +113 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
module Company
|
|
2
|
+
module Mapping
|
|
3
|
+
|
|
4
|
+
# NormalizedTermFrequency specialises TermFrequency with a logarithmically
# scaled weighting: tf(t,d) = 1 + log(f(t,d)), where f(t,d) is the raw count
# returned by the inherited rawFrequency helper and log is the natural
# logarithm (Math.log). Only tokens present in the text appear in the result.
class NormalizedTermFrequency < TermFrequency

  # Returns a Hash mapping every token found in +text+ to
  # 1.0 + Math.log(raw occurrence count).
  def calculate(text)
    rawFrequency(text).each_with_object({}) do |(term, count), scaled|
      scaled[term] = 1.0 + Math.log(count)
    end
  end

  # Returns a short human-readable description of this weighting scheme.
  def info
    "Logarithmically scaled term frequency: tf(t,d) = 1 + log(f(t,d)), or zero if ft,d is zero"
  end
end
|
|
25
|
+
|
|
26
|
+
end
|
|
27
|
+
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
module Company
|
|
2
|
+
module Mapping
|
|
3
|
+
|
|
4
|
+
# Raw term-frequency calculator: counts how many times each token produced
# by the injected tokenizer occurs in a piece of text.
class TermFrequency

  # tokenizer - object responding to #tokenize(text); its result is iterated
  #             with #each to obtain the individual tokens.
  def initialize(tokenizer)
    @tokenizer = tokenizer
  end

  # Returns a Hash of token => number of occurrences for the given text.
  def calculate(text)
    rawFrequency(text)
  end

  # Returns a short human-readable description of this weighting scheme.
  def info
    "Raw term frequency (number of times a token appears in a given string - document)"
  end

  protected

  # Tokenizes +contents+ and tallies each token. Keys appear in the order in
  # which tokens are first seen; the result is a plain Hash (no default value).
  def rawFrequency(contents)
    counts = {}
    @tokenizer.tokenize(contents).each do |token|
      counts[token] = counts.fetch(token, 0) + 1
    end
    counts
  end
end
|
|
35
|
+
|
|
36
|
+
end
|
|
37
|
+
end
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
module Company
|
|
2
|
+
module Mapping
|
|
3
|
+
|
|
4
|
+
# TFIDF implements the Term Frequency - Inverse Document Frequency statistic
# over a corpus and exposes a cosine similarity between documents based on
# the resulting weights.
class TFIDF
  # tf  - term-frequency strategy; must respond to #calculate(text) and return
  #       a Hash of term => frequency. When left unset, #calculate installs a
  #       NormalizedTermFrequency backed by a BasicTokenizer.
  # idf - inverse-document-frequency provider; must respond to #calculate
  #       (returning a Hash of term => weight) and #maxIDF. When left unset,
  #       #calculate installs InverseDocumentFrequency.new(corpus).
  attr_accessor :tf, :idf

  # corpus - collection responding to #each and yielding documents that
  #          expose #id and #contents.
  def initialize(corpus)
    @corpus = corpus
  end

  # Calculates the tf-idf weights of every document in the corpus.
  #
  # Returns a Hash of document id => { term => weight }. The table is rebuilt
  # from scratch on every call, so weights previously added through
  # #calculate_tfidf_weights_of_new_document are discarded.
  def calculate
    @tfidf = {}

    @idf = InverseDocumentFrequency.new(@corpus) if @idf.nil?
    @tf = NormalizedTermFrequency.new(BasicTokenizer.new) if @tf.nil?

    @idf_weights = @idf.calculate

    @corpus.each do |doc|
      @tfidf[doc.id] = weigh_terms(@tf.calculate(doc.contents))
    end

    @tfidf
  end

  # Calculates the tf-idf weights of a new incoming document without adding
  # it to the corpus and without re-calculating the weights of the corpus.
  # Terms unknown to the corpus are weighted with the provider's maxIDF.
  #
  # If the corpus weights have not been calculated yet they are calculated
  # first; previously this method failed with NoMethodError on nil here.
  #
  # Returns the full weight table (document id => { term => weight }), which
  # now also holds the new document under new_doc.id.
  def calculate_tfidf_weights_of_new_document(new_doc)
    calculate if @tfidf.nil?

    @tfidf[new_doc.id] = weigh_terms(@tf.calculate(new_doc.contents))
    @tfidf
  end

  # Calculates the tf-idf similarity between two documents, i.e. the cosine
  # similarity of their tf*idf weight vectors. Weights are calculated lazily
  # on first use.
  #
  # doc1_id, doc2_id - ids of documents already present in the weight table.
  def similarity(doc1_id, doc2_id)
    calculate if @tfidf.nil?

    CosineSimilarity.new.calculate(@tfidf[doc1_id], @tfidf[doc2_id])
  end

  # Returns a human-readable description of the statistic.
  #
  # The fragments are joined with a trailing "+": a line that *starts* with
  # +"..." is parsed as a separate unary-plus statement, which made the
  # original method return only its last fragment.
  def info
    " term frequency–inverse document frequency, is a numerical " +
      "statistic that is intended to reflect how important a word " +
      "is to a document in a collection or corpus"
  end

  private

  # Turns a Hash of term => frequency into a Hash of term => tf*idf weight.
  # Terms missing from the IDF table fall back to @idf.maxIDF (looked up only
  # when actually needed).
  def weigh_terms(termfreq)
    termfreq.each_with_object({}) do |(term, freq), weights|
      idf_weight = @idf_weights.has_key?(term) ? @idf_weights[term] : @idf.maxIDF
      weights[term] = freq * idf_weight
    end
  end
end
|
|
79
|
+
|
|
80
|
+
end
|
|
81
|
+
end
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
module Company
|
|
2
|
+
module Mapping
|
|
3
|
+
|
|
4
|
+
# Implements Cosine Similarity between two vectors: the cosine of the angle
# between them. Vectors are given as Hashes of token => numeric weight.
class CosineSimilarity

  # Calculates the cosine similarity between two documents expressed as
  # token => weight Hashes (bag of words model).
  #
  # Returns the similarity rounded to 4 decimal places. When either vector
  # has zero magnitude (empty, or all weights zero) the cosine is undefined;
  # 0.0 is returned instead of the NaN produced by dividing by zero.
  def calculate(doc1, doc2)
    magnitude = Math.sqrt(d(doc1)) * Math.sqrt(d(doc2))
    return 0.0 if magnitude.zero?

    (dotProduct(doc1, doc2) / magnitude).round(4)
  end

  # Returns a human-readable description of the measure.
  #
  # The two sentences are joined with a trailing "+": a line that *starts*
  # with +"..." is parsed as a separate unary-plus statement, which made the
  # original method return only the second sentence.
  def info
    "Implements Cosine Similarity between two non zero vectors and it measures the cosine of the angle between them.\n" +
      "More info about Cosine Similarity can be found: https://en.wikipedia.org/wiki/Cosine_similarity"
  end

  protected

  # Calculates the dot product of the two document vectors. Only tokens
  # present in both documents contribute, since a missing token has an
  # implicit weight of zero.
  def dotProduct(doc1, doc2)
    findCommonTokens(doc1.keys, doc2.keys).reduce(0.0) do |sum, token|
      sum + doc2[token] * doc1[token]
    end
  end

  # Returns the sum of squared weights of a document vector, i.e. the
  # *squared* magnitude; callers take the square root themselves.
  def d(doc)
    doc.each_value.reduce(0.0) { |sum, weight| sum + weight**2.0 }
  end

  # Returns the Set of tokens common to both token lists.
  def findCommonTokens(doc1_tokens, doc2_tokens)
    Set.new(doc1_tokens).intersection(Set.new(doc2_tokens))
  end
end
|
|
45
|
+
|
|
46
|
+
end
|
|
47
|
+
end
|
metadata
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: company-mapping
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- vasgat
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: exe
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2017-03-21 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: bundler
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - "~>"
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '1.14'
|
|
20
|
+
type: :development
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - "~>"
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '1.14'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: rake
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - "~>"
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: '10.0'
|
|
34
|
+
type: :development
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - "~>"
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: '10.0'
|
|
41
|
+
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: minitest
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - "~>"
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '5.0'
|
|
48
|
+
type: :development
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - "~>"
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '5.0'
|
|
55
|
+
description: Given a Corpus of WikiRate Company Names and a new (incoming) Company
|
|
56
|
+
Name, CompanyMapper class, finds the closest match if exists based on the calculated
|
|
57
|
+
tf-idf similarity and a defined threshold.
|
|
58
|
+
email:
|
|
59
|
+
- vasgat@gmail.com
|
|
60
|
+
executables: []
|
|
61
|
+
extensions: []
|
|
62
|
+
extra_rdoc_files: []
|
|
63
|
+
files:
|
|
64
|
+
- ".gitignore"
|
|
65
|
+
- ".idea/company-mapping.iml"
|
|
66
|
+
- ".idea/misc.xml"
|
|
67
|
+
- ".idea/modules.xml"
|
|
68
|
+
- ".idea/workspace.xml"
|
|
69
|
+
- ".travis.yml"
|
|
70
|
+
- CODE_OF_CONDUCT.md
|
|
71
|
+
- Gemfile
|
|
72
|
+
- LICENSE.txt
|
|
73
|
+
- README.md
|
|
74
|
+
- Rakefile
|
|
75
|
+
- bin/console
|
|
76
|
+
- bin/setup
|
|
77
|
+
- company-mapping.gemspec
|
|
78
|
+
- lib/company/mapping.rb
|
|
79
|
+
- lib/company/mapping/company_mapper.rb
|
|
80
|
+
- lib/company/mapping/document_utils/basic_tokenizer.rb
|
|
81
|
+
- lib/company/mapping/document_utils/corpus.rb
|
|
82
|
+
- lib/company/mapping/document_utils/text_document.rb
|
|
83
|
+
- lib/company/mapping/tfidf/idf/inverse_document_frequency.rb
|
|
84
|
+
- lib/company/mapping/tfidf/tf/normalized_term_frequency.rb
|
|
85
|
+
- lib/company/mapping/tfidf/tf/term_frequency.rb
|
|
86
|
+
- lib/company/mapping/tfidf/tfidf.rb
|
|
87
|
+
- lib/company/mapping/vector_similarity/cosine_similarity.rb
|
|
88
|
+
- lib/company/mapping/version.rb
|
|
89
|
+
homepage: https://github.com/vasgat/company-mapping
|
|
90
|
+
licenses:
|
|
91
|
+
- MIT
|
|
92
|
+
metadata: {}
|
|
93
|
+
post_install_message:
|
|
94
|
+
rdoc_options: []
|
|
95
|
+
require_paths:
|
|
96
|
+
- lib
|
|
97
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
98
|
+
requirements:
|
|
99
|
+
- - ">="
|
|
100
|
+
- !ruby/object:Gem::Version
|
|
101
|
+
version: '0'
|
|
102
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
103
|
+
requirements:
|
|
104
|
+
- - ">="
|
|
105
|
+
- !ruby/object:Gem::Version
|
|
106
|
+
version: '0'
|
|
107
|
+
requirements: []
|
|
108
|
+
rubyforge_project:
|
|
109
|
+
rubygems_version: 2.6.11
|
|
110
|
+
signing_key:
|
|
111
|
+
specification_version: 4
|
|
112
|
+
summary: Maps new companies with those in a given corpus.
|
|
113
|
+
test_files: []
|