tf-idf-similarity 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +1 -1
- data/lib/tf-idf-similarity.rb +0 -2
- data/lib/tf-idf-similarity/collection.rb +13 -0
- data/lib/tf-idf-similarity/document.rb +8 -1
- data/lib/tf-idf-similarity/version.rb +1 -1
- metadata +3 -8
data/LICENSE
CHANGED
@@ -17,4 +17,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
17
17
|
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
18
|
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
19
|
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
-
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/lib/tf-idf-similarity.rb
CHANGED
data/lib/tf-idf-similarity/collection.rb
CHANGED
@@ -42,6 +42,7 @@ class TfIdfSimilarity::Collection
|
|
42
42
|
# @see http://en.wikipedia.org/wiki/Vector_space_model
|
43
43
|
# @see http://en.wikipedia.org/wiki/Document-term_matrix
|
44
44
|
# @see http://en.wikipedia.org/wiki/Cosine_similarity
|
45
|
+
# @see http://en.wikipedia.org/wiki/Okapi_BM25
|
45
46
|
def similarity_matrix(opts = {})
|
46
47
|
if stdlib?
|
47
48
|
idf = []
|
@@ -78,6 +79,18 @@ class TfIdfSimilarity::Collection
|
|
78
79
|
end
|
79
80
|
end
|
80
81
|
|
82
|
+
# @param [Document] document a document
|
83
|
+
# @param [String] term a term
|
84
|
+
# @param [Hash] opts optional arguments
|
85
|
+
# @option opts [Symbol] :function one of :tfidf (default) or :bm25
|
86
|
+
# @return [Float] the term's frequency in the document
|
87
|
+
#
|
88
|
+
# @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
|
89
|
+
def term_frequency_inverse_document_frequency(document, term, opts = {})
|
90
|
+
inverse_document_frequency(term, opts) * term_frequency(document, term, opts)
|
91
|
+
end
|
92
|
+
alias_method :tfidf, :term_frequency_inverse_document_frequency
|
93
|
+
|
81
94
|
# @param [String] term a term
|
82
95
|
# @param [Hash] opts optional arguments
|
83
96
|
# @option opts [Symbol] :function one of :tfidf (default) or :bm25
|
data/lib/tf-idf-similarity/document.rb
CHANGED
@@ -6,6 +6,8 @@ class TfIdfSimilarity::Document
|
|
6
6
|
attr_reader :id
|
7
7
|
# The document's text.
|
8
8
|
attr_reader :text
|
9
|
+
# The document's tokenized text.
|
10
|
+
attr_reader :tokens
|
9
11
|
# The number of times each term appears in the document.
|
10
12
|
attr_reader :term_counts
|
11
13
|
# The document size, in terms.
|
@@ -14,9 +16,11 @@ class TfIdfSimilarity::Document
|
|
14
16
|
# @param [String] text the document's text
|
15
17
|
# @param [Hash] opts optional arguments
|
16
18
|
# @option opts [String] :id a string to identify the document
|
19
|
+
# @option opts [Array] :tokens the document's tokenized text
|
17
20
|
def initialize(text, opts = {})
|
18
21
|
@text = text
|
19
22
|
@id = opts[:id] || object_id
|
23
|
+
@tokens = opts[:tokens]
|
20
24
|
@term_counts = Hash.new 0
|
21
25
|
process
|
22
26
|
end
|
@@ -51,6 +55,9 @@ private
|
|
51
55
|
# Tokenizes a text, respecting the word boundary rules from Unicode’s Default
|
52
56
|
# Word Boundary Specification.
|
53
57
|
#
|
58
|
+
# If a tokenized text was provided at the document's initialization, those
|
59
|
+
# tokens will be returned without additional processing.
|
60
|
+
#
|
54
61
|
# @param [String] text a text
|
55
62
|
# @return [Enumerator] a token enumerator
|
56
63
|
#
|
@@ -60,6 +67,6 @@ private
|
|
60
67
|
# @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
|
61
68
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
|
62
69
|
def tokenize(text)
|
63
|
-
UnicodeUtils.each_word
|
70
|
+
@tokens || UnicodeUtils.each_word(text)
|
64
71
|
end
|
65
72
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tf-idf-similarity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-10-29 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: unicode_utils
|
@@ -93,18 +93,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
93
93
|
- - ! '>='
|
94
94
|
- !ruby/object:Gem::Version
|
95
95
|
version: '0'
|
96
|
-
segments:
|
97
|
-
- 0
|
98
|
-
hash: -4125970683092216956
|
99
96
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
100
97
|
none: false
|
101
98
|
requirements:
|
102
99
|
- - ! '>='
|
103
100
|
- !ruby/object:Gem::Version
|
104
101
|
version: '0'
|
105
|
-
segments:
|
106
|
-
- 0
|
107
|
-
hash: -4125970683092216956
|
108
102
|
requirements: []
|
109
103
|
rubyforge_project:
|
110
104
|
rubygems_version: 1.8.24
|
@@ -112,3 +106,4 @@ signing_key:
|
|
112
106
|
specification_version: 3
|
113
107
|
summary: Calculates the similarity between texts using tf*idf
|
114
108
|
test_files: []
|
109
|
+
has_rdoc:
|