similarityTextCoefficients 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/similarityTextCoefficients.rb +98 -0
- metadata +47 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 4deed5d254b448b0f72730d1d7259e3f6fa44b41
|
4
|
+
data.tar.gz: 526d8ee307d479d0a93121d76430b53b16be8cfc
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: eb7649ab9ba6cd423555dc12f108d7c29d08b66fd60b0df83f1dda4448cd5a762b319fc895cce9cd791e943e1364e829bef5e0e668095057e734bf08afaf0e2b
|
7
|
+
data.tar.gz: 1daafe71aaa86253a61fdd09a3bd428fec4c8f9cdd510b25db6e88bc5dfd06cfcbdf5d4c8bcef326a3114823ba70136bf64a6e9267406b988e2f8bf125b8380a
|
@@ -0,0 +1,98 @@
|
|
1
|
+
require "set"
|
2
|
+
|
3
|
+
# Word-level text similarity scores: Dice's coefficient over character
# bigrams, and the Jaccard (Tanimoto) index over whitespace-separated words.
class SimilarityTextCoefficients
  # Returns the best Dice coefficient found between any word of +a+ and any
  # word of +b+.
  #
  # Both texts are split on whitespace and every word pair is compared through
  # its sets of adjacent character pairs (bigrams):
  #
  #   dice = 2 * shared_bigrams / (bigrams_of_word_a + bigrams_of_word_b)
  #
  # @param a [String] first text
  # @param b [String] second text
  # @return [Float] highest pairwise score, in 0.0..1.0; 0.0 when either text
  #   contains no words
  def self.dice_coefficient(a, b)
    # Build each word's bigram set once instead of once per compared pair.
    words_a = a.split(" ").map { |word| [word, word.each_char.each_cons(2).to_set] }
    words_b = b.split(" ").map { |word| [word, word.each_char.each_cons(2).to_set] }

    best = 0.0
    words_b.each do |word_b, grams_b|
      words_a.each do |word_a, grams_a|
        total = grams_a.size + grams_b.size
        score =
          if total.zero?
            # Both words are single characters, so neither has a bigram and the
            # formula would be 0/0 (NaN). Fall back to plain equality.
            word_a == word_b ? 1.0 : 0.0
          else
            (grams_a & grams_b).size * 2.0 / total
          end
        best = score if score > best
      end
    end
    best
  end

  # Jaccard index = intersection / ((numberA + numberB) - intersection)
  # where
  #   numberA      - number of distinct words in text A
  #   numberB      - number of distinct words in text B
  #   intersection - number of distinct words present in both texts
  #
  # Words are compared as sets, so repeated words count once and the result is
  # symmetric and never exceeds 1.
  #
  # @param a [String] first text
  # @param b [String] second text
  # @return [Float] index in 0.0..1.0; 0.0 when neither text contains a word
  def self.createJaccardIndex(a, b)
    # Split each text into its distinct words.
    words_a = a.split(" ").uniq
    words_b = b.split(" ").uniq

    # Count the words the two texts have in common.
    intersection = (words_a & words_b).size
    union = words_a.size + words_b.size - intersection

    # Two empty texts would otherwise divide 0.0 by 0 and yield NaN.
    return 0.0 if union.zero?

    intersection.to_f / union
  end
end
|
metadata
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: similarityTextCoefficients
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Gilberto Flores
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-04-27 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Enter two strings and it compares their similarity and gives a score
|
14
|
+
between 0 and 1, when 1 is the similarity
|
15
|
+
email: gilbertofp16@gmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/similarityTextCoefficients.rb
|
21
|
+
homepage: http://rubygems.org/gems/SimilarityTextCoefficients
|
22
|
+
licenses:
|
23
|
+
- MIT
|
24
|
+
metadata: {}
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
requirements: []
|
40
|
+
rubyforge_project:
|
41
|
+
rubygems_version: 2.4.6
|
42
|
+
signing_key:
|
43
|
+
specification_version: 4
|
44
|
+
summary: For text comparations it implements Dice's coefficient and Jaccard's or Tanimoto
|
45
|
+
index
|
46
|
+
test_files: []
|
47
|
+
has_rdoc:
|