tf-idf-similarity 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE CHANGED
@@ -17,4 +17,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
17
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
18
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
19
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -1,5 +1,3 @@
1
- $LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__))) unless $LOAD_PATH.include?(File.expand_path(File.dirname(__FILE__)))
2
-
3
1
  module TfIdfSimilarity
4
2
  autoload :Collection, 'tf-idf-similarity/collection'
5
3
  autoload :Document, 'tf-idf-similarity/document'
@@ -42,6 +42,7 @@ class TfIdfSimilarity::Collection
42
42
  # @see http://en.wikipedia.org/wiki/Vector_space_model
43
43
  # @see http://en.wikipedia.org/wiki/Document-term_matrix
44
44
  # @see http://en.wikipedia.org/wiki/Cosine_similarity
45
+ # @see http://en.wikipedia.org/wiki/Okapi_BM25
45
46
  def similarity_matrix(opts = {})
46
47
  if stdlib?
47
48
  idf = []
@@ -78,6 +79,18 @@ class TfIdfSimilarity::Collection
78
79
  end
79
80
  end
80
81
 
82
+ # @param [Document] document a document
83
+ # @param [String] term a term
84
+ # @param [Hash] opts optional arguments
85
+ # @option opts [Symbol] :function one of :tfidf (default) or :bm25
86
+ # @return [Float] the term's tf*idf (or BM25) weight in the document
87
+ #
88
+ # @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
89
+ def term_frequency_inverse_document_frequency(document, term, opts = {})
90
+ inverse_document_frequency(term, opts) * term_frequency(document, term, opts)
91
+ end
92
+ alias_method :tfidf, :term_frequency_inverse_document_frequency
93
+
81
94
  # @param [String] term a term
82
95
  # @param [Hash] opts optional arguments
83
96
  # @option opts [Symbol] :function one of :tfidf (default) or :bm25
@@ -6,6 +6,8 @@ class TfIdfSimilarity::Document
6
6
  attr_reader :id
7
7
  # The document's text.
8
8
  attr_reader :text
9
+ # The document's tokenized text.
10
+ attr_reader :tokens
9
11
  # The number of times each term appears in the document.
10
12
  attr_reader :term_counts
11
13
  # The document size, in terms.
@@ -14,9 +16,11 @@ class TfIdfSimilarity::Document
14
16
  # @param [String] text the document's text
15
17
  # @param [Hash] opts optional arguments
16
18
  # @option opts [String] :id a string to identify the document
19
+ # @option opts [Array] :tokens the document's tokenized text
17
20
  def initialize(text, opts = {})
18
21
  @text = text
19
22
  @id = opts[:id] || object_id
23
+ @tokens = opts[:tokens]
20
24
  @term_counts = Hash.new 0
21
25
  process
22
26
  end
@@ -51,6 +55,9 @@ private
51
55
  # Tokenizes a text, respecting the word boundary rules from Unicode’s Default
52
56
  # Word Boundary Specification.
53
57
  #
58
+ # If a tokenized text was provided at the document's initialization, those
59
+ # tokens will be returned without additional processing.
60
+ #
54
61
  # @param [String] text a text
55
62
  # @return [Enumerator, Array] a token enumerator, or the tokens given at initialization
56
63
  #
@@ -60,6 +67,6 @@ private
60
67
  # @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
61
68
  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
62
69
  def tokenize(text)
63
- UnicodeUtils.each_word text
70
+ @tokens || UnicodeUtils.each_word(text)
64
71
  end
65
72
  end
@@ -1,3 +1,3 @@
1
1
  module TfIdfSimilarity
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tf-idf-similarity
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-11 00:00:00.000000000 Z
12
+ date: 2012-10-29 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: unicode_utils
@@ -93,18 +93,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
93
93
  - - ! '>='
94
94
  - !ruby/object:Gem::Version
95
95
  version: '0'
96
- segments:
97
- - 0
98
- hash: -4125970683092216956
99
96
  required_rubygems_version: !ruby/object:Gem::Requirement
100
97
  none: false
101
98
  requirements:
102
99
  - - ! '>='
103
100
  - !ruby/object:Gem::Version
104
101
  version: '0'
105
- segments:
106
- - 0
107
- hash: -4125970683092216956
108
102
  requirements: []
109
103
  rubyforge_project:
110
104
  rubygems_version: 1.8.24
@@ -112,3 +106,4 @@ signing_key:
112
106
  specification_version: 3
113
107
  summary: Calculates the similarity between texts using tf*idf
114
108
  test_files: []
109
+ has_rdoc: