RubyGems - tf-idf-similarity - Versions diffs - 0.0.3 → 0.0.4 - Mend

tf-idf-similarity 0.0.3 → 0.0.4

Files changed (6)

data/LICENSE CHANGED

@@ -17,4 +17,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/lib/tf-idf-similarity.rb CHANGED

@@ -1,5 +1,3 @@
-$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__))) unless $LOAD_PATH.include?(File.expand_path(File.dirname(__FILE__)))
 module TfIdfSimilarity
   autoload :Collection, 'tf-idf-similarity/collection'
   autoload :Document, 'tf-idf-similarity/document'

@@ -42,6 +42,7 @@ class TfIdfSimilarity::Collection
   # @see http://en.wikipedia.org/wiki/Vector_space_model
   # @see http://en.wikipedia.org/wiki/Document-term_matrix
   # @see http://en.wikipedia.org/wiki/Cosine_similarity
+  # @see http://en.wikipedia.org/wiki/Okapi_BM25
   def similarity_matrix(opts = {})
     if stdlib?
       idf = []
@@ -78,6 +79,18 @@ class TfIdfSimilarity::Collection
     end
   end
+  # @param [Document] document a document
+  # @param [String] term a term
+  # @param [Hash] opts optional arguments
+  # @option opts [Symbol] :function one of :tfidf (default) or :bm25
+  # @return [Float] the term's tf*idf weight in the document
+  #
+  # @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
+  def term_frequency_inverse_document_frequency(document, term, opts = {})
+    inverse_document_frequency(term, opts) * term_frequency(document, term, opts)
+  end
+  alias_method :tfidf, :term_frequency_inverse_document_frequency
   # @param [String] term a term
   # @param [Hash] opts optional arguments
   # @option opts [Symbol] :function one of :tfidf (default) or :bm25

@@ -6,6 +6,8 @@ class TfIdfSimilarity::Document
   attr_reader :id
   # The document's text.
   attr_reader :text
+  # The document's tokenized text.
+  attr_reader :tokens
   # The number of times each term appears in the document.
   attr_reader :term_counts
   # The document size, in terms.
@@ -14,9 +16,11 @@ class TfIdfSimilarity::Document
   # @param [String] text the document's text
   # @param [Hash] opts optional arguments
   # @option opts [String] :id a string to identify the document
+  # @option opts [Array] :tokens the document's tokenized text
   def initialize(text, opts = {})
     @text        = text
     @id          = opts[:id] || object_id
+    @tokens      = opts[:tokens]
     @term_counts = Hash.new 0
     process
   end
@@ -51,6 +55,9 @@ private
   # Tokenizes a text, respecting the word boundary rules from Unicode’s Default
   # Word Boundary Specification.
   #
+  # If a tokenized text was provided at the document's initialization, those
+  # tokens will be returned without additional processing.
+  #
   # @param [String] text a text
   # @return [Enumerator] a token enumerator
   #
@@ -60,6 +67,6 @@ private
   # @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
   # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
   def tokenize(text)
-    UnicodeUtils.each_word text
+    @tokens || UnicodeUtils.each_word(text)
   end
 end

@@ -1,3 +1,3 @@
 module TfIdfSimilarity
-  VERSION = "0.0.3"
+  VERSION = "0.0.4"
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: tf-idf-similarity
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-09-11 00:00:00.000000000 Z
+date: 2012-10-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode_utils
@@ -93,18 +93,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
-      segments:
-      - 0
-      hash: -4125970683092216956
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
-      segments:
-      - 0
-      hash: -4125970683092216956
 requirements: []
 rubyforge_project:
 rubygems_version: 1.8.24
@@ -112,3 +106,4 @@ signing_key:
 specification_version: 3
 summary: Calculates the similarity between texts using tf*idf
 test_files: []
+has_rdoc: