similarity_tree 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/README.md +4 -10
- data/lib/similarity_tree/similarity_matrix.rb +3 -2
- data/lib/similarity_tree/version.rb +1 -1
- data/similarity_tree.gemspec +2 -1
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
ZDMwNDI5ODQ4ZTU2NDhiYjM4NmViMDM2NTlhYTA1OGU2MzI4Yjk2Ng==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
ZDM2ZmI2OTY4M2Q4MTlkODQ0Yjg1NTlmMmFiMWQ2OTY5ZDE0MTk5NA==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
YjhlZmM2Y2M0MDFhYjcwODU0OWJkZDE4MmM3NGViOTVkNzg2NzI0MDNiOTVl
|
10
|
+
MzRjMzk2YTY2YzI5NTJjNDE1YjEwMmY0YjM5ZTJjNThkYjI4MDZiYzZkODcx
|
11
|
+
OWEzZTU4MDgxODAyOGVjNWFjZjE1NTgxMTgxYjdiOWNhMjNiOTk=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
NmUxNmVjOTMyMTdiYTU0Njk2MjY2YzAwNjI4NmUzMDQ4NzM5MTJlNDdmODUw
|
14
|
+
NjM3YTY2ZTkwZDRiYzc1ZWI2MGU1NTJjZGVhZjI3YjA0NTM4ZWY4YzI2Yzc1
|
15
|
+
NmJjMmM2NmFiZDZkNDVlODYyZmUxZjk0NTZkNDgyZjkwOTEwNzA=
|
data/README.md
CHANGED
@@ -43,27 +43,22 @@ You can operate directly on **strings** rather than files (in this case, the nod
|
|
43
43
|
tree = SimilarityTree::SimilarityMatrix.new(documents).build_tree("CC-BY-3.0.html")
|
44
44
|
puts tree.to_s # to_h and to_json are also available as other tree output formats
|
45
45
|
|
46
|
-
CC-BY-3.0.html
|
47
|
-
-CC-BY-NC-3.0.html (0.9197574893009985)
|
48
|
-
--CC-BY-NC-SA-3.0.html (0.9503146737330241)
|
49
|
-
--CC-BY-NC-ND-3.0.html (0.9456402772710689)
|
50
|
-
-CC-BY-ND-3.0.html (0.9434472109631346)
|
51
|
-
|
52
46
|
Result:
|
47
|
+
|
53
48
|
0
|
54
49
|
-1 (0.9197574893009985)
|
55
50
|
--3 (0.9503146737330241)
|
56
51
|
--4 (0.9456402772710689)
|
57
52
|
-2 (0.9434472109631346)
|
58
53
|
|
59
|
-
Or, you can use any **enumerable list of objects** (eg. ActiveRecords) as the inputs. Consider:
|
54
|
+
Or, you can use any **enumerable list of objects** (eg. ActiveRecords) as the inputs. Consider the model:
|
60
55
|
|
61
56
|
class Document < ActiveRecord::Base
|
62
57
|
attr_accessible :title, :text_filename
|
63
58
|
...
|
64
59
|
end
|
65
60
|
|
66
|
-
|
61
|
+
Generate the tree as follows:
|
67
62
|
|
68
63
|
tree = SimilarityTree::SimilarityMatrix.new(Document.all,
|
69
64
|
id_func: :title, content_func: :text_filename).build_tree(Document.first.title)
|
@@ -74,10 +69,9 @@ You can call:
|
|
74
69
|
|
75
70
|
You can use either the **term frequency–inverse document frequency** (:tf_idf, the default) or **Dice's coefficient** from a
|
76
71
|
standard unix-style diff to calculate the diff scores. Tf-idf works much better where a document has a lot of translations
|
77
|
-
(that is, "cut and pastes" of sections of text into different locations) and is
|
72
|
+
(that is, "cut and pastes" of sections of text into different locations) and is often faster. However, if your intent
|
78
73
|
is to show diffs of the text, the :diff option will correlate better to your diff rendering.
|
79
74
|
|
80
|
-
documents = Dir.glob('../../similarity_tree/test/cc_licences/*.html')
|
81
75
|
tf_idf_tree = SimilarityTree::SimilarityMatrix.new(documents,
|
82
76
|
calculation_method: :tf_idf).build_tree("CC-BY-3.0.html")
|
83
77
|
diff_tree = SimilarityTree::SimilarityMatrix.new(documents,
|
data/lib/similarity_tree/similarity_matrix.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
|
-
require '
|
1
|
+
require 'gsl'
|
2
2
|
require 'tf-idf-similarity'
|
3
3
|
require 'fast_html_diff'
|
4
|
+
require 'ruby-progressbar'
|
4
5
|
|
5
6
|
module SimilarityTree
|
6
7
|
# Table of the diff/similarity scores between different text documents
|
@@ -62,7 +63,7 @@ module SimilarityTree
|
|
62
63
|
progress_bar.increment unless progress_bar.nil?
|
63
64
|
id_of(source)
|
64
65
|
end
|
65
|
-
model = TfIdfSimilarity::TfIdfModel.new(corpus, function: :tf_idf)
|
66
|
+
model = TfIdfSimilarity::TfIdfModel.new(corpus, function: :tf_idf, library: :gsl)
|
66
67
|
similarity_matrix = model.similarity_matrix
|
67
68
|
|
68
69
|
# compile the results into an ordinary m*n array
|
data/similarity_tree.gemspec
CHANGED
@@ -20,7 +20,8 @@ Gem::Specification.new do |spec|
|
|
20
20
|
|
21
21
|
spec.add_development_dependency "bundler", "~> 1.3"
|
22
22
|
spec.add_development_dependency "rake"
|
23
|
-
spec.add_runtime_dependency "fast_html_diff"
|
23
|
+
spec.add_runtime_dependency "fast_html_diff", "~> 0.8.1"
|
24
|
+
spec.add_runtime_dependency "gsl"
|
24
25
|
spec.add_runtime_dependency "tf-idf-similarity"
|
25
26
|
spec.add_runtime_dependency "ruby-progressbar"
|
26
27
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: similarity_tree
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kent Mewhort
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-07-
|
11
|
+
date: 2013-07-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -40,6 +40,20 @@ dependencies:
|
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: fast_html_diff
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ~>
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 0.8.1
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 0.8.1
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: gsl
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
59
|
- - ! '>='
|