tf-idf-similarity 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +1 -1
- data/lib/tf-idf-similarity.rb +0 -2
- data/lib/tf-idf-similarity/collection.rb +13 -0
- data/lib/tf-idf-similarity/document.rb +8 -1
- data/lib/tf-idf-similarity/version.rb +1 -1
- metadata +3 -8
data/LICENSE
CHANGED
@@ -17,4 +17,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
17
17
|
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
18
|
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
19
|
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
-
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/lib/tf-idf-similarity.rb
CHANGED
@@ -42,6 +42,7 @@ class TfIdfSimilarity::Collection
|
|
42
42
|
# @see http://en.wikipedia.org/wiki/Vector_space_model
|
43
43
|
# @see http://en.wikipedia.org/wiki/Document-term_matrix
|
44
44
|
# @see http://en.wikipedia.org/wiki/Cosine_similarity
|
45
|
+
# @see http://en.wikipedia.org/wiki/Okapi_BM25
|
45
46
|
def similarity_matrix(opts = {})
|
46
47
|
if stdlib?
|
47
48
|
idf = []
|
@@ -78,6 +79,18 @@ class TfIdfSimilarity::Collection
|
|
78
79
|
end
|
79
80
|
end
|
80
81
|
|
82
|
+
# @param [Document] document a document
|
83
|
+
# @param [String] term a term
|
84
|
+
# @param [Hash] opts optional arguments
|
85
|
+
# @option opts [Symbol] :function one of :tfidf (default) or :bm25
|
86
|
+
# @return [Float] the term frequency–inverse document frequency of the term in the document
|
87
|
+
#
|
88
|
+
# @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
|
89
|
+
def term_frequency_inverse_document_frequency(document, term, opts = {})
|
90
|
+
inverse_document_frequency(term, opts) * term_frequency(document, term, opts)
|
91
|
+
end
|
92
|
+
alias_method :tfidf, :term_frequency_inverse_document_frequency
|
93
|
+
|
81
94
|
# @param [String] term a term
|
82
95
|
# @param [Hash] opts optional arguments
|
83
96
|
# @option opts [Symbol] :function one of :tfidf (default) or :bm25
|
@@ -6,6 +6,8 @@ class TfIdfSimilarity::Document
|
|
6
6
|
attr_reader :id
|
7
7
|
# The document's text.
|
8
8
|
attr_reader :text
|
9
|
+
# The document's tokenized text.
|
10
|
+
attr_reader :tokens
|
9
11
|
# The number of times each term appears in the document.
|
10
12
|
attr_reader :term_counts
|
11
13
|
# The document size, in terms.
|
@@ -14,9 +16,11 @@ class TfIdfSimilarity::Document
|
|
14
16
|
# @param [String] text the document's text
|
15
17
|
# @param [Hash] opts optional arguments
|
16
18
|
# @option opts [String] :id a string to identify the document
|
19
|
+
# @option opts [Array] :tokens the document's tokenized text
|
17
20
|
def initialize(text, opts = {})
|
18
21
|
@text = text
|
19
22
|
@id = opts[:id] || object_id
|
23
|
+
@tokens = opts[:tokens]
|
20
24
|
@term_counts = Hash.new 0
|
21
25
|
process
|
22
26
|
end
|
@@ -51,6 +55,9 @@ private
|
|
51
55
|
# Tokenizes a text, respecting the word boundary rules from Unicode’s Default
|
52
56
|
# Word Boundary Specification.
|
53
57
|
#
|
58
|
+
# If a tokenized text was provided at the document's initialization, those
|
59
|
+
# tokens will be returned without additional processing.
|
60
|
+
#
|
54
61
|
# @param [String] text a text
|
55
62
|
# @return [Enumerator] a token enumerator
|
56
63
|
#
|
@@ -60,6 +67,6 @@ private
|
|
60
67
|
# @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
|
61
68
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
|
62
69
|
def tokenize(text)
|
63
|
-
UnicodeUtils.each_word text
|
70
|
+
@tokens || UnicodeUtils.each_word(text)
|
64
71
|
end
|
65
72
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tf-idf-similarity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-10-29 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: unicode_utils
|
@@ -93,18 +93,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
93
93
|
- - ! '>='
|
94
94
|
- !ruby/object:Gem::Version
|
95
95
|
version: '0'
|
96
|
-
segments:
|
97
|
-
- 0
|
98
|
-
hash: -4125970683092216956
|
99
96
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
100
97
|
none: false
|
101
98
|
requirements:
|
102
99
|
- - ! '>='
|
103
100
|
- !ruby/object:Gem::Version
|
104
101
|
version: '0'
|
105
|
-
segments:
|
106
|
-
- 0
|
107
|
-
hash: -4125970683092216956
|
108
102
|
requirements: []
|
109
103
|
rubyforge_project:
|
110
104
|
rubygems_version: 1.8.24
|
@@ -112,3 +106,4 @@ signing_key:
|
|
112
106
|
specification_version: 3
|
113
107
|
summary: Calculates the similarity between texts using tf*idf
|
114
108
|
test_files: []
|
109
|
+
has_rdoc:
|