similarity_tree 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/README.md +4 -10
- data/lib/similarity_tree/similarity_matrix.rb +3 -2
- data/lib/similarity_tree/version.rb +1 -1
- data/similarity_tree.gemspec +2 -1
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
ZDMwNDI5ODQ4ZTU2NDhiYjM4NmViMDM2NTlhYTA1OGU2MzI4Yjk2Ng==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
ZDM2ZmI2OTY4M2Q4MTlkODQ0Yjg1NTlmMmFiMWQ2OTY5ZDE0MTk5NA==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
YjhlZmM2Y2M0MDFhYjcwODU0OWJkZDE4MmM3NGViOTVkNzg2NzI0MDNiOTVl
|
10
|
+
MzRjMzk2YTY2YzI5NTJjNDE1YjEwMmY0YjM5ZTJjNThkYjI4MDZiYzZkODcx
|
11
|
+
OWEzZTU4MDgxODAyOGVjNWFjZjE1NTgxMTgxYjdiOWNhMjNiOTk=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
NmUxNmVjOTMyMTdiYTU0Njk2MjY2YzAwNjI4NmUzMDQ4NzM5MTJlNDdmODUw
|
14
|
+
NjM3YTY2ZTkwZDRiYzc1ZWI2MGU1NTJjZGVhZjI3YjA0NTM4ZWY4YzI2Yzc1
|
15
|
+
NmJjMmM2NmFiZDZkNDVlODYyZmUxZjk0NTZkNDgyZjkwOTEwNzA=
|
data/README.md
CHANGED
@@ -43,27 +43,22 @@ You can operate directly on **strings** rather than files (in this case, the nod
|
|
43
43
|
tree = SimilarityTree::SimilarityMatrix.new(documents).build_tree("CC-BY-3.0.html")
|
44
44
|
put tree.to_s # to_h and to_json are also available as other tree output formats
|
45
45
|
|
46
|
-
CC-BY-3.0.html
|
47
|
-
-CC-BY-NC-3.0.html (0.9197574893009985)
|
48
|
-
--CC-BY-NC-SA-3.0.html (0.9503146737330241)
|
49
|
-
--CC-BY-NC-ND-3.0.html (0.9456402772710689)
|
50
|
-
-CC-BY-ND-3.0.html (0.9434472109631346)
|
51
|
-
|
52
46
|
Result:
|
47
|
+
|
53
48
|
0
|
54
49
|
-1 (0.9197574893009985)
|
55
50
|
--3 (0.9503146737330241)
|
56
51
|
--4 (0.9456402772710689)
|
57
52
|
-2 (0.9434472109631346)
|
58
53
|
|
59
|
-
Or, you can use any **enumerable list of objects** (eg. ActiveRecords) as the inputs. Consider:
|
54
|
+
Or, you can use any **enumerable list of objects** (eg. ActiveRecords) as the inputs. Consider the model:
|
60
55
|
|
61
56
|
class Document < ActiveRecord::Base
|
62
57
|
attr_accessible :title, :text_filename
|
63
58
|
...
|
64
59
|
end
|
65
60
|
|
66
|
-
|
61
|
+
Generate the tree as follows:
|
67
62
|
|
68
63
|
tree = SimilarityTree::SimilarityMatrix.new(Document.all,
|
69
64
|
id_func: :title, content_func: :text_filename).build_tree(Document.first.title)
|
@@ -74,10 +69,9 @@ You can call:
|
|
74
69
|
|
75
70
|
You can use either the **term frequency–inverse document frequency** (:tf_idf, the default) or **Dice's coefficient** from a
|
76
71
|
standard unix-style diff to calculate the diff scores. Tf-idf works much better where a document has a lot of translations
|
77
|
-
(that is, "cut and pastes" of sections of text into different locations) and is
|
72
|
+
(that is, "cut and pastes" of sections of text into different locations) and is often faster. However, if your intent
|
78
73
|
is to show diffs of the text, the :diff option will correlate better to your diff rendering.
|
79
74
|
|
80
|
-
documents = Dir.glob('../../similarity_tree/test/cc_licences/*.html')
|
81
75
|
tf_idf_tree = SimilarityTree::SimilarityMatrix.new(documents,
|
82
76
|
calculation_method: :tf_idf).build_tree("CC-BY-3.0.html")
|
83
77
|
diff_tree = SimilarityTree::SimilarityMatrix.new(documents,
|
data/lib/similarity_tree/similarity_matrix.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
|
-
require '
|
1
|
+
require 'gsl'
|
2
2
|
require 'tf-idf-similarity'
|
3
3
|
require 'fast_html_diff'
|
4
|
+
require 'ruby-progressbar'
|
4
5
|
|
5
6
|
module SimilarityTree
|
6
7
|
# Table of the diff/similarity scores between different text documents
|
@@ -62,7 +63,7 @@ module SimilarityTree
|
|
62
63
|
progress_bar.increment unless progress_bar.nil?
|
63
64
|
id_of(source)
|
64
65
|
end
|
65
|
-
model = TfIdfSimilarity::TfIdfModel.new(corpus, function: :tf_idf)
|
66
|
+
model = TfIdfSimilarity::TfIdfModel.new(corpus, function: :tf_idf, library: :gsl)
|
66
67
|
similarity_matrix = model.similarity_matrix
|
67
68
|
|
68
69
|
# compile the results into an ordinary m*n array
|
data/similarity_tree.gemspec
CHANGED
@@ -20,7 +20,8 @@ Gem::Specification.new do |spec|
|
|
20
20
|
|
21
21
|
spec.add_development_dependency "bundler", "~> 1.3"
|
22
22
|
spec.add_development_dependency "rake"
|
23
|
-
spec.add_runtime_dependency "fast_html_diff"
|
23
|
+
spec.add_runtime_dependency "fast_html_diff", "~> 0.8.1"
|
24
|
+
spec.add_runtime_dependency "gsl"
|
24
25
|
spec.add_runtime_dependency "tf-idf-similarity"
|
25
26
|
spec.add_runtime_dependency "ruby-progressbar"
|
26
27
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: similarity_tree
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kent Mewhort
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-07-
|
11
|
+
date: 2013-07-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -40,6 +40,20 @@ dependencies:
|
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: fast_html_diff
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ~>
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 0.8.1
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 0.8.1
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: gsl
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
59
|
- - ! '>='
|