company-mapping 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.idea/company-mapping.iml +35 -1
- data/.idea/workspace.xml +363 -105
- data/company-mapping.gemspec +27 -27
- data/lib/company/mapping.rb +20 -18
- data/lib/company/mapping/company_mapper.rb +10 -18
- data/lib/company/mapping/document_utils/basic_tokenizer.rb +19 -24
- data/lib/company/mapping/document_utils/company_corpus.rb +32 -0
- data/lib/company/mapping/document_utils/corpus.rb +3 -22
- data/lib/company/mapping/document_utils/text_document.rb +1 -6
- data/lib/company/mapping/tfidf/idf/inverse_document_frequency.rb +6 -32
- data/lib/company/mapping/tfidf/tf/normalized_term_frequency.rb +2 -13
- data/lib/company/mapping/tfidf/tf/term_frequency.rb +6 -19
- data/lib/company/mapping/tfidf/tfidf.rb +20 -48
- data/lib/company/mapping/vector_similarity/cosine_similarity.rb +9 -21
- data/lib/company/mapping/version.rb +1 -1
- metadata +5 -4
|
@@ -1,47 +1,35 @@
|
|
|
1
1
|
module Company
|
|
2
2
|
module Mapping
|
|
3
3
|
|
|
4
|
-
#Implements Cosine Similarity between two non zero vectors and it measures the cosine of the angle between them.
|
|
4
|
+
#Implements Cosine Similarity between two non zero vectors and it measures the cosine of the angle between them.
|
|
5
5
|
class CosineSimilarity
|
|
6
6
|
|
|
7
7
|
#Calculates cosine similarity between two documents. The documents are expressed as vectors of tokens (bag of words model).
|
|
8
8
|
def calculate(doc1, doc2)
|
|
9
|
-
|
|
10
|
-
end
|
|
11
|
-
|
|
12
|
-
def info
|
|
13
|
-
"Implements Cosine Similarity between two non zero vectors and it measures the cosine of the angle between them.\n"
|
|
14
|
-
+"More info about Cosine Similarity can be found: https://en.wikipedia.org/wiki/Cosine_similarity";
|
|
9
|
+
(dotProduct(doc1, doc2) / (Math.sqrt(d(doc1)) * Math.sqrt(d(doc2)))).round(4)
|
|
15
10
|
end
|
|
16
11
|
|
|
17
12
|
protected
|
|
18
13
|
#Calculates the dot product between the two document vectors. The dot product is an algebraic operation
|
|
19
14
|
# that takes two equal-length sequences of numbers (usually coordinate vectors) and returns a single number.
|
|
20
15
|
def dotProduct(doc1, doc2)
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
_common_tokens.each do |token|
|
|
25
|
-
_dot_product += doc2[token] * doc1[token]
|
|
16
|
+
common_tokens(doc1.keys, doc2.keys).inject(0.0) do |dot_product, token|
|
|
17
|
+
dot_product + doc2[token] * doc1[token]
|
|
26
18
|
end
|
|
27
|
-
return _dot_product
|
|
28
19
|
end
|
|
29
20
|
|
|
30
21
|
#Calculates the magnitude of a vector document
|
|
31
22
|
def d(doc)
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
_d += doc[term]**2.0
|
|
23
|
+
doc.keys.inject(0.0) do |d, term|
|
|
24
|
+
d + doc[term]**2.0
|
|
35
25
|
end
|
|
36
|
-
return _d
|
|
37
26
|
end
|
|
38
27
|
|
|
39
28
|
#returns the set of common tokens between two document vectors
|
|
40
|
-
def
|
|
29
|
+
def common_tokens(doc1_tokens, doc2_tokens)
|
|
41
30
|
common_tokens = Set.new doc1_tokens
|
|
42
|
-
|
|
31
|
+
common_tokens.intersection(Set.new doc2_tokens)
|
|
43
32
|
end
|
|
44
33
|
end
|
|
45
|
-
|
|
46
34
|
end
|
|
47
|
-
end
|
|
35
|
+
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: company-mapping
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.0
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- vasgat
|
|
@@ -39,19 +39,19 @@ dependencies:
|
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
40
|
version: '10.0'
|
|
41
41
|
- !ruby/object:Gem::Dependency
|
|
42
|
-
name:
|
|
42
|
+
name: rspec
|
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
|
44
44
|
requirements:
|
|
45
45
|
- - "~>"
|
|
46
46
|
- !ruby/object:Gem::Version
|
|
47
|
-
version: '5
|
|
47
|
+
version: '3.5'
|
|
48
48
|
type: :development
|
|
49
49
|
prerelease: false
|
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
|
51
51
|
requirements:
|
|
52
52
|
- - "~>"
|
|
53
53
|
- !ruby/object:Gem::Version
|
|
54
|
-
version: '5
|
|
54
|
+
version: '3.5'
|
|
55
55
|
description: Given a Corpus of WikiRate Company Names and a new (incoming) Company
|
|
56
56
|
Name, CompanyMapper class, finds the closest match if exists based on the calculated
|
|
57
57
|
tf-idf similarity and a defined threshold.
|
|
@@ -78,6 +78,7 @@ files:
|
|
|
78
78
|
- lib/company/mapping.rb
|
|
79
79
|
- lib/company/mapping/company_mapper.rb
|
|
80
80
|
- lib/company/mapping/document_utils/basic_tokenizer.rb
|
|
81
|
+
- lib/company/mapping/document_utils/company_corpus.rb
|
|
81
82
|
- lib/company/mapping/document_utils/corpus.rb
|
|
82
83
|
- lib/company/mapping/document_utils/text_document.rb
|
|
83
84
|
- lib/company/mapping/tfidf/idf/inverse_document_frequency.rb
|