RubyGems - logankoester-classifier - Versions diffs - 1.4.3 - Mend

logankoester-classifier 1.4.3

Files changed (26) hide show

data/LICENSE +429 -0
data/Manifest +19 -0
data/README.rdoc +124 -0
data/Rakefile +21 -0
data/VERSION.yml +5 -0
data/lib/classifier.rb +31 -0
data/lib/classifier/base.rb +65 -0
data/lib/classifier/bayes.rb +145 -0
data/lib/classifier/extensions/vector.rb +100 -0
data/lib/classifier/extensions/vector_serialize.rb +20 -0
data/lib/classifier/lsi.rb +348 -0
data/lib/classifier/lsi/content_node.rb +73 -0
data/lib/classifier/lsi/summary.rb +31 -0
data/lib/classifier/lsi/word_list.rb +36 -0
data/lib/classifier/stopwords.rb +42 -0
data/lib/classifier/stopwords/en +82 -0
data/lib/classifier/stopwords/es +339 -0
data/lib/classifier/stopwords/ru +161 -0
data/lib/init.rb +1 -0
data/tasks/test.rake +6 -0
data/test/base_test.rb +17 -0
data/test/bayes/bayesian_test.rb +68 -0
data/test/lsi/lsi_test.rb +167 -0
data/test/stopwords_test.rb +38 -0
data/test/test_helper.rb +4 -0
metadata +127 -0

data/lib/classifier/lsi/content_node.rb ADDED Viewed

@@ -0,0 +1,73 @@
+# Author::    David Fayram  (mailto:dfayram@lensmen.net)
+# Copyright:: Copyright (c) 2005 David Fayram II
+# License::   LGPL
+module Classifier
+# This is an internal data structure class for the LSI node. Save for
+# raw_vector_with, it should be fairly straightforward to understand.
+# You should never have to use it directly.
+  class ContentNode
+    attr_accessor :raw_vector, :raw_norm,
+                  :lsi_vector, :lsi_norm,
+                  :categories
+    attr_reader :word_hash
+    # If text_proc is not specified, the source will be duck-typed
+    # via source.to_s
+    def initialize( word_hash, *categories )
+      @categories = categories || []
+      @word_hash = word_hash
+    end
+    # Use this to fetch the appropriate search vector.
+    def search_vector
+      @lsi_vector || @raw_vector
+    end
+    # Use this to fetch the appropriate search vector in normalized form.
+    def search_norm
+      @lsi_norm || @raw_norm
+    end
+    # Creates the raw vector out of word_hash using word_list as the
+    # key for mapping the vector space.
+    def raw_vector_with( word_list )
+      if $GSL
+         vec = GSL::Vector.alloc(word_list.size)
+      else
+         vec = Array.new(word_list.size, 0)
+      end
+      @word_hash.each_key do |word|
+        vec[word_list[word]] = @word_hash[word] if word_list[word]
+      end
+      # Perform the scaling transform
+      total_words = vec.sum.to_f
+      # Perform first-order association transform if this vector has more
+      # than one word in it.
+      if total_words > 1.0
+        weighted_total = 0.0
+        vec.each do |term|
+          if ( term > 0 )
+            weighted_total += (( term / total_words ) * Math.log( term / total_words ))
+          end
+        end
+        weighted_total = -1.0 if weighted_total.zero? # if no word in list is known
+        vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
+      end
+      if $GSL
+         @raw_norm   = vec.normalize
+         @raw_vector = vec
+      else
+         @raw_norm   = Vector[*vec].normalize
+         @raw_vector = Vector[*vec]
+      end
+    end
+  end
+end

data/lib/classifier/lsi/summary.rb ADDED Viewed

@@ -0,0 +1,31 @@
+# Author::    Lucas Carlson  (mailto:lucas@rufy.com)
+# Copyright:: Copyright (c) 2005 Lucas Carlson
+# License::   LGPL
+class String
+   def summary( count=10, separator=" [...] " )
+      perform_lsi split_sentences, count, separator
+   end
+   def paragraph_summary( count=1, separator=" [...] " )
+      perform_lsi split_paragraphs, count, separator
+   end
+   def split_sentences
+      split /(\.|\!|\?)/ # TODO: make this less primitive
+   end
+   def split_paragraphs
+      split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
+   end
+   private
+   def perform_lsi(chunks, count, separator)
+      lsi = Classifier::LSI.new :auto_rebuild => false
+      chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
+      lsi.build_index
+      summaries = lsi.highest_relative_content count
+      return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
+   end
+end

data/lib/classifier/lsi/word_list.rb ADDED Viewed

@@ -0,0 +1,36 @@
+# Author::    David Fayram  (mailto:dfayram@lensmen.net)
+# Copyright:: Copyright (c) 2005 David Fayram II
+# License::   LGPL
+module Classifier
+  # This class keeps a word => index mapping. It is used to map stemmed words
+  # to dimensions of a vector.
+  class WordList
+    def initialize
+      @location_table = Hash.new
+    end
+    # Adds a word (if it is new) and assigns it a unique dimension.
+    def add_word(word)
+      term = word
+      @location_table[term] = @location_table.size unless @location_table[term]
+    end
+    # Returns the dimension of the word or nil if the word is not in the space.
+    def [](lookup)
+      term = lookup
+      @location_table[term]
+    end
+    def word_for_index(ind)
+      @location_table.invert[ind]
+    end
+    # Returns the number of words mapped.
+    def size
+      @location_table.size
+    end
+  end
+end

data/lib/classifier/stopwords.rb ADDED Viewed

@@ -0,0 +1,42 @@
+module Classifier
+  module StopWords
+    def self.for(language, lang_dir=nil)
+      unless STOP_WORDS.has_key?(language)
+        STOP_WORDS[language] = load_stopwords(language, lang_dir) || []
+      end
+      STOP_WORDS[language]
+    end
+    def self.reset
+      STOP_WORDS.clear
+    end
+    protected
+      def self.load_stopwords(language, lang_dir)
+        default_dir = File.join(File.dirname(__FILE__), 'stopwords')
+        load_file(language, lang_dir) || load_file(language, default_dir) || []
+      end
+      def self.load_file(language, lang_dir)
+        return if lang_dir.nil?
+        lang_file = File.join(lang_dir, language)
+        if File.exist?(lang_file)
+          data = []
+          File.open(lang_file, 'r:utf-8') do |f|
+            f.each_line do |line|
+              line = line.gsub(/#.*/, '').strip
+              data << line unless line.empty?
+            end
+          end
+          data unless data.empty?
+        end
+      end
+    STOP_WORDS = {}
+  end
+end

data/lib/classifier/stopwords/en ADDED Viewed

@@ -0,0 +1,82 @@
+# English stopwords
+# Extracted from the gem's source code
+a
+again
+all
+along
+are
+also
+an
+and
+as
+at
+but
+by
+came
+can
+cant
+couldnt
+did
+didn
+didnt
+do
+doesnt
+dont
+ever
+first
+from
+have
+her
+here
+him
+how
+i
+if
+in
+into
+is
+isnt
+it
+itll
+just
+last
+least
+like
+most
+my
+new
+no
+not
+now
+of
+on
+or
+should
+sinc
+so
+some
+th
+than
+this
+that
+the
+their
+then
+those
+to
+told
+too
+true
+try
+until
+url
+us
+were
+when
+whether
+while
+with
+within
+yes
+you
+youll

data/lib/classifier/stopwords/es ADDED Viewed

@@ -0,0 +1,339 @@
+# Spanish stopwords
+# http://snowball.tartarus.org/algorithms/spanish/stop.txt
+de             #  from, of
+la             #  the, her
+que            #  who, that
+el             #  the
+en             #  in
+y              #  and
+a              #  to
+los            #  the, them
+del            #  de + el
+se             #  himself, from him etc
+las            #  the, them
+por            #  for, by, etc
+un             #  a
+para           #  for
+con            #  with
+no             #  no
+una            #  a
+su             #  his, her
+al             #  a + el
+es             # from SER
+lo             #  him
+como           #  how
+más            #  more
+pero           #  but
+sus            #  su plural
+le             #  to him, her
+ya             #  already
+o              #  or
+fue            # from SER
+este           #  this
+ha             # from HABER
+sí             #  himself etc
+porque         #  because
+esta           #  this
+son            # from SER
+entre          #  between
+está           # from ESTAR
+cuando         #  when
+muy            #  very
+sin            #  without
+sobre          #  on
+ser            # from SER
+tiene          # from TENER
+también        #  also
+me             #  me
+hasta          #  until
+hay            #  there is/are
+donde          #  where
+han            # from HABER
+quien          #  whom, that
+están          # from ESTAR
+estado         # from ESTAR
+desde          #  from
+todo           #  all
+nos            #  us
+durante        #  during
+estados        # from ESTAR
+todos          #  all
+uno            #  a
+les            #  to them
+ni             #  nor
+contra         #  against
+otros          #  other
+fueron         # from SER
+ese            #  that
+eso            #  that
+había          # from HABER
+ante           #  before
+ellos          #  they
+e              #  and (variant of y)
+esto           #  this
+mí             #  me
+antes          #  before
+algunos        #  some
+qué            #  what?
+unos           #  a
+yo             #  I
+otro           #  other
+otras          #  other
+otra           #  other
+él             #  he
+tanto          #  so much, many
+esa            #  that
+estos          #  these
+mucho          #  much, many
+quienes        #  who
+nada           #  nothing
+muchos         #  many
+cual           #  who
+sea            # from SER
+poco           #  few
+ella           #  she
+estar          #  to be
+haber          # from HABER
+estas          #  these
+estaba         # from ESTAR
+estamos        # from ESTAR
+algunas        #  some
+algo           #  something
+nosotros       #  we
+      # other forms
+mi             #  me
+mis            #  mi plural
+tú             #  thou
+te             #  thee
+ti             #  thee
+tu             #  thy
+tus            #  tu plural
+ellas          #  they
+nosotras       #  we
+vosotros       #  you
+vosotras       #  you
+os             #  you
+mío            #  mine
+mía            #
+míos           #
+mías           #
+tuyo           #  thine
+tuya           #
+tuyos          #
+tuyas          #
+suyo           #  his, hers, theirs
+suya           #
+suyos          #
+suyas          #
+nuestro        #  ours
+nuestra        #
+nuestros       #
+nuestras       #
+vuestro        #  yours
+vuestra        #
+vuestros       #
+vuestras       #
+esos           #  those
+esas           #  those
+               # forms of estar, to be (not including the infinitive):
+estoy
+estás
+está
+estamos
+estáis
+están
+esté
+estés
+estemos
+estéis
+estén
+estaré
+estarás
+estará
+estaremos
+estaréis
+estarán
+estaría
+estarías
+estaríamos
+estaríais
+estarían
+estaba
+estabas
+estábamos
+estabais
+estaban
+estuve
+estuviste
+estuvo
+estuvimos
+estuvisteis
+estuvieron
+estuviera
+estuvieras
+estuviéramos
+estuvierais
+estuvieran
+estuviese
+estuvieses
+estuviésemos
+estuvieseis
+estuviesen
+estando
+estado
+estada
+estados
+estadas
+estad
+               # forms of haber, to have (not including the infinitive):
+he
+has
+ha
+hemos
+habéis
+han
+haya
+hayas
+hayamos
+hayáis
+hayan
+habré
+habrás
+habrá
+habremos
+habréis
+habrán
+habría
+habrías
+habríamos
+habríais
+habrían
+había
+habías
+habíamos
+habíais
+habían
+hube
+hubiste
+hubo
+hubimos
+hubisteis
+hubieron
+hubiera
+hubieras
+hubiéramos
+hubierais
+hubieran
+hubiese
+hubieses
+hubiésemos
+hubieseis
+hubiesen
+habiendo
+habido
+habida
+habidos
+habidas
+               # forms of ser, to be (not including the infinitive):
+soy
+eres
+es
+somos
+sois
+son
+sea
+seas
+seamos
+seáis
+sean
+seré
+serás
+será
+seremos
+seréis
+serán
+sería
+serías
+seríamos
+seríais
+serían
+era
+eras
+éramos
+erais
+eran
+fui
+fuiste
+fue
+fuimos
+fuisteis
+fueron
+fuera
+fueras
+fuéramos
+fuerais
+fueran
+fuese
+fueses
+fuésemos
+fueseis
+fuesen
+siendo
+sido
+  #  sed also means 'thirst'
+               # forms of tener, to have (not including the infinitive):
+tengo
+tienes
+tiene
+tenemos
+tenéis
+tienen
+tenga
+tengas
+tengamos
+tengáis
+tengan
+tendré
+tendrás
+tendrá
+tendremos
+tendréis
+tendrán
+tendría
+tendrías
+tendríamos
+tendríais
+tendrían
+tenía
+tenías
+teníamos
+teníais
+tenían
+tuve
+tuviste
+tuvo
+tuvimos
+tuvisteis
+tuvieron
+tuviera
+tuvieras
+tuviéramos
+tuvierais
+tuvieran
+tuviese
+tuvieses
+tuviésemos
+tuvieseis
+tuviesen
+teniendo
+tenido
+tenida
+tenidos
+tenidas
+tened