similarityTextCoefficients 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 4deed5d254b448b0f72730d1d7259e3f6fa44b41
4
+ data.tar.gz: 526d8ee307d479d0a93121d76430b53b16be8cfc
5
+ SHA512:
6
+ metadata.gz: eb7649ab9ba6cd423555dc12f108d7c29d08b66fd60b0df83f1dda4448cd5a762b319fc895cce9cd791e943e1364e829bef5e0e668095057e734bf08afaf0e2b
7
+ data.tar.gz: 1daafe71aaa86253a61fdd09a3bd428fec4c8f9cdd510b25db6e88bc5dfd06cfcbdf5d4c8bcef326a3114823ba70136bf64a6e9267406b988e2f8bf125b8380a
@@ -0,0 +1,98 @@
1
+ require "set"
2
+
3
# Word-based text similarity scores: Dice's coefficient over character
# bigrams and the Jaccard (Tanimoto) index over whitespace-separated words.
# Both class methods take two strings and return a Float in 0.0..1.0.
class SimilarityTextCoefficients
  # Best Dice coefficient found between any word of +a+ and any word of +b+.
  #
  # Both strings are split on whitespace; every word of +a+ is compared with
  # every word of +b+ using the sets of adjacent character pairs (bigrams),
  # and the highest per-pair score is returned.
  #
  # @param a [String] first text
  # @param b [String] second text
  # @return [Float] highest per-word-pair score; 0.0 when either text
  #   contains no words
  def self.dice_coefficient(a, b)
    # Build each word's bigram set once instead of once per compared pair.
    left = a.split(" ").map { |word| [word, word.each_char.each_cons(2).to_set] }
    right = b.split(" ").map { |word| [word, word.each_char.each_cons(2).to_set] }

    scores = left.product(right).map do |first, second|
      word_dice(first, second)
    end

    # No pairs at all means at least one side had no words.
    scores.max || 0.0
  end

  # Dice coefficient for one pair of words.
  #
  # @param first [Array(String, Set)] a word and its bigram set
  # @param second [Array(String, Set)] a word and its bigram set
  # @return [Float] 2 * |shared bigrams| / (|bigrams 1| + |bigrams 2|)
  def self.word_dice(first, second)
    first_word, first_bigrams = first
    second_word, second_bigrams = second
    total = first_bigrams.size + second_bigrams.size

    if total.zero?
      # Neither word has a bigram (both are single characters). Dividing by
      # zero here would yield NaN, so fall back to exact equality instead.
      first_word == second_word ? 1.0 : 0.0
    else
      (first_bigrams & second_bigrams).size * 2.0 / total
    end
  end
  private_class_method :word_dice

  # Jaccard index = intersection / ((numberA + numberB) - intersection)
  # where
  #   numberA      - number of distinct words in text A
  #   numberB      - number of distinct words in text B
  #   intersection - number of distinct words present in both texts
  #
  # Words are de-duplicated before counting so that repeated words cannot
  # push the result above 1.
  #
  # @param a [String] first text
  # @param b [String] second text
  # @return [Float] Jaccard index; 0.0 when neither text contains a word
  def self.createJaccardIndex(a, b)
    words_a = a.split(" ").uniq
    words_b = b.split(" ").uniq

    intersection = (words_a & words_b).size
    union = words_a.size + words_b.size - intersection

    # Both texts empty: avoid 0.0 / 0, which would be NaN.
    return 0.0 if union.zero?

    intersection.to_f / union
  end
end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: similarityTextCoefficients
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Gilberto Flores
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-04-27 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Enter two strings and it compares their similarity and gives a score
14
+ between 0 and 1, when 1 is the similarity
15
+ email: gilbertofp16@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/similarityTextCoefficients.rb
21
+ homepage: http://rubygems.org/gems/SimilarityTextCoefficients
22
+ licenses:
23
+ - MIT
24
+ metadata: {}
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubyforge_project:
41
+ rubygems_version: 2.4.6
42
+ signing_key:
43
+ specification_version: 4
44
+ summary: For text comparations it implements Dice's coefficient and Jaccard's or Tanimoto
45
+ index
46
+ test_files: []
47
+ has_rdoc: