tf-idf-similarity 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -8
- data/Gemfile +2 -2
- data/README.md +40 -9
- data/lib/tf-idf-similarity.rb +1 -0
- data/lib/tf-idf-similarity/bm25_model.rb +23 -62
- data/lib/tf-idf-similarity/document.rb +69 -67
- data/lib/tf-idf-similarity/extras/document.rb +10 -8
- data/lib/tf-idf-similarity/extras/tf_idf_model.rb +157 -155
- data/lib/tf-idf-similarity/matrix_methods.rb +137 -135
- data/lib/tf-idf-similarity/model.rb +66 -0
- data/lib/tf-idf-similarity/term_count_model.rb +59 -57
- data/lib/tf-idf-similarity/tf_idf_model.rb +21 -60
- data/lib/tf-idf-similarity/token.rb +39 -37
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/spec/bm25_model_spec.rb +200 -0
- data/spec/document_spec.rb +98 -96
- data/spec/extras/tf_idf_model_spec.rb +224 -222
- data/spec/spec_helper.rb +6 -0
- data/spec/term_count_model_spec.rb +76 -74
- data/spec/tf_idf_model_spec.rb +143 -117
- data/spec/token_spec.rb +23 -21
- metadata +6 -2
data/spec/token_spec.rb
CHANGED
@@ -1,34 +1,36 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
|
-
|
5
|
-
describe
|
6
|
-
|
7
|
-
|
8
|
-
|
4
|
+
module TfIdfSimilarity
|
5
|
+
describe Token do
|
6
|
+
describe '#valid?' do
|
7
|
+
it 'should return false if all of its characters are numbers, punctuation or whitespace characters' do
|
8
|
+
Token.new('1 2 3 ! @ #').valid?.should == false
|
9
|
+
end
|
9
10
|
|
10
|
-
|
11
|
-
|
11
|
+
it 'should return true if not all of its characters are numbers, punctuation or whitespace characters' do
|
12
|
+
Token.new('1 2 3 ! @ # a').valid?.should == true
|
13
|
+
end
|
12
14
|
end
|
13
|
-
end
|
14
15
|
|
15
|
-
|
16
|
-
|
17
|
-
|
16
|
+
describe '#lowercase_filter' do
|
17
|
+
it 'should lowercase the token' do
|
18
|
+
Token.new('HÉTÉROGÉNÉITÉ').lowercase_filter.should == 'hétérogénéité'
|
19
|
+
end
|
18
20
|
end
|
19
|
-
end
|
20
21
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
22
|
+
describe '#classic_filter' do
|
23
|
+
it 'should remove all periods' do
|
24
|
+
Token.new('X.Y.Z.').classic_filter.should == 'XYZ'
|
25
|
+
end
|
25
26
|
|
26
|
-
|
27
|
-
|
28
|
-
|
27
|
+
it 'should remove ending possessives' do
|
28
|
+
Token.new("foo's").classic_filter.should == 'foo'
|
29
|
+
end
|
29
30
|
|
30
|
-
|
31
|
-
|
31
|
+
it 'should not remove infix possessives' do
|
32
|
+
Token.new("foo's bar").classic_filter.should == "foo's bar"
|
33
|
+
end
|
32
34
|
end
|
33
35
|
end
|
34
36
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tf-idf-similarity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Open North
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-10-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -89,10 +89,12 @@ files:
|
|
89
89
|
- lib/tf-idf-similarity/extras/document.rb
|
90
90
|
- lib/tf-idf-similarity/extras/tf_idf_model.rb
|
91
91
|
- lib/tf-idf-similarity/matrix_methods.rb
|
92
|
+
- lib/tf-idf-similarity/model.rb
|
92
93
|
- lib/tf-idf-similarity/term_count_model.rb
|
93
94
|
- lib/tf-idf-similarity/tf_idf_model.rb
|
94
95
|
- lib/tf-idf-similarity/token.rb
|
95
96
|
- lib/tf-idf-similarity/version.rb
|
97
|
+
- spec/bm25_model_spec.rb
|
96
98
|
- spec/document_spec.rb
|
97
99
|
- spec/extras/tf_idf_model_spec.rb
|
98
100
|
- spec/spec_helper.rb
|
@@ -125,9 +127,11 @@ signing_key:
|
|
125
127
|
specification_version: 4
|
126
128
|
summary: Calculates the similarity between texts using tf*idf
|
127
129
|
test_files:
|
130
|
+
- spec/bm25_model_spec.rb
|
128
131
|
- spec/document_spec.rb
|
129
132
|
- spec/extras/tf_idf_model_spec.rb
|
130
133
|
- spec/spec_helper.rb
|
131
134
|
- spec/term_count_model_spec.rb
|
132
135
|
- spec/tf_idf_model_spec.rb
|
133
136
|
- spec/token_spec.rb
|
137
|
+
has_rdoc:
|