company-mapping 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,47 +1,35 @@
1
1
  module Company
2
2
  module Mapping
3
3
 
4
- #Implements Cosine Similarity between two non zero vectors and it measures the cosine of the angle between them.
4
+ #Implements Cosine Similarity between two non zero vectors and it measures the cosine of the angle between them.
5
5
  class CosineSimilarity
6
6
 
7
7
  #Calculates cosine similarity between two documents. The documents are expressed as vectors of tokens (bag of words model).
8
8
  def calculate(doc1, doc2)
9
- return (dotProduct(doc1, doc2) / (Math.sqrt(d(doc1)) * Math.sqrt(d(doc2)))).round(4)
10
- end
11
-
12
- def info
13
- "Implements Cosine Similarity between two non zero vectors and it measures the cosine of the angle between them.\n"
14
- +"More info about Cosine Similarity can be found: https://en.wikipedia.org/wiki/Cosine_similarity";
9
+ (dotProduct(doc1, doc2) / (Math.sqrt(d(doc1)) * Math.sqrt(d(doc2)))).round(4)
15
10
  end
16
11
 
17
12
  protected
18
13
  #Calculated the dot product between the two document vectors. The dot product is an algebraic operation
19
14
  # that takes two equal-length sequences of numbers (usually coordinate vectors) and returns a single number.
20
15
  def dotProduct(doc1, doc2)
21
- _common_tokens = findCommonTokens(doc1.keys, doc2.keys)
22
- _dot_product = 0.0
23
-
24
- _common_tokens.each do |token|
25
- _dot_product += doc2[token] * doc1[token]
16
+ common_tokens(doc1.keys, doc2.keys).inject(0.0) do |dot_product, token|
17
+ dot_product + doc2[token] * doc1[token]
26
18
  end
27
- return _dot_product
28
19
  end
29
20
 
30
21
  #Calculates the magnitude of a vector document
31
22
  def d(doc)
32
- _d = 0.0
33
- doc.keys.each do |term|
34
- _d += doc[term]**2.0
23
+ doc.keys.inject(0.0) do |d, term|
24
+ d + doc[term]**2.0
35
25
  end
36
- return _d
37
26
  end
38
27
 
39
28
  #returns the set of common tokens between two document vectors
40
- def findCommonTokens(doc1_tokens, doc2_tokens)
29
+ def common_tokens(doc1_tokens, doc2_tokens)
41
30
  common_tokens = Set.new doc1_tokens
42
- return common_tokens.intersection(Set.new doc2_tokens)
31
+ common_tokens.intersection(Set.new doc2_tokens)
43
32
  end
44
33
  end
45
-
46
34
  end
47
- end
35
+ end
@@ -1,5 +1,5 @@
1
1
  module Company
2
2
  module Mapping
3
- VERSION = "0.1.0"
3
+ VERSION = "0.2.0"
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: company-mapping
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - vasgat
@@ -39,19 +39,19 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: minitest
42
+ name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '5.0'
47
+ version: '3.5'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '5.0'
54
+ version: '3.5'
55
55
  description: Given a Corpus of WikiRate Company Names and a new (incoming) Company
56
56
  Name, CompanyMapper class, finds the closest match if exists based on the calculated
57
57
  tf-idf similarity and a defined threshold.
@@ -78,6 +78,7 @@ files:
78
78
  - lib/company/mapping.rb
79
79
  - lib/company/mapping/company_mapper.rb
80
80
  - lib/company/mapping/document_utils/basic_tokenizer.rb
81
+ - lib/company/mapping/document_utils/company_corpus.rb
81
82
  - lib/company/mapping/document_utils/corpus.rb
82
83
  - lib/company/mapping/document_utils/text_document.rb
83
84
  - lib/company/mapping/tfidf/idf/inverse_document_frequency.rb