categorize 0.0.1

data/lib/categorize.rb ADDED
@@ -0,0 +1,49 @@
+ require File.join(File.dirname(__FILE__), 'models', 'bag_of_words')
+ require File.join(File.dirname(__FILE__), 'constants')
+
+ module Categorize
+   MIN_WORD_LENGTH = 3
+   @bag_of_words = BagOfWords.new
+
+   class << self
+     # ==== Return
+     # Hash - category => results
+     # ==== Parameters
+     # query:: the query the documents were retrieved for
+     # documents:: a list of documents to be classified
+     def make_model(query, documents, topic_model = @bag_of_words)
+       records_to_tokens = lexicalize(documents)
+       topic_model.model(query.downcase.strip, records_to_tokens)
+     end
+
+     # ==== Return
+     # Hash - term => 1-indexed token positions at which the term was seen
+     # ==== Parameters
+     # strings:: the strings to be classified
+     def make_model_c(strings)
+       # fall back to Ruby tokenization; model_bow (below) refers to a
+       # C extension that is not included in this gem, so without this
+       # assignment `ret` would be undefined
+       ret = strings.flat_map { |s| preprocess(s) }
+       # ret = model_bow(array_of_tokens)
+       count = 0
+       ret.inject({}) do |hash, term|
+         hash[term] ||= []
+         hash[term] << count += 1
+         hash
+       end
+     end
+
+     private
+
+     def lexicalize(strings)
+       Hash[(0...strings.length).zip(strings.map { |s| preprocess(s) })]
+     end
+
+     def preprocess(string)
+       string.split(Constants::Words::SPLIT_REGEX).map(&:downcase).delete_if do |word|
+         word.length < MIN_WORD_LENGTH ||
+           Constants::Words::COMMON.include?(word)
+       end
+     end
+   end
+ end
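
A minimal usage sketch of the public entry point above. The document strings and the exact buckets shown in the trailing comment are illustrative assumptions, not output taken from the gem:

    require 'categorize'

    docs = [
      'ruby is a dynamic programming language',
      'rails is a web framework written in ruby',
      'python is another dynamic language'
    ]
    buckets = Categorize.make_model('ruby', docs)
    # buckets maps each winning gram to the indices of the documents it
    # best fits, e.g. { 'dynamic' => [0, 2], 'framework' => [1] }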
data/lib/constants.rb ADDED
@@ -0,0 +1,541 @@
+ module Constants
+   module Words
+     # only include words > 2 chars
+     # NB: %w splits on whitespace, so two-word entries such as
+     # "days ago" and "hours ago" end up as separate tokens
+     ENGLISH = %w(
+       000 page home free also about above according accordingly across
+       after afterward afterwards again against all almost alone along
+       already also although always among amongst amp and another any
+       anyhow anyone anything anywhere apr are aug around became because
+       become becomes becoming been before beforehand began behind being
+       below beside besides between beyond both but can cannot certain
+       com could days ago dec did does down during each edu either else
+       elsewhere enough especially est etc even ever every everyone
+       everything everywhere example except feb few fewer finally find
+       following for former formerly from further furthermore generally
+       get given had has have having hence henceforth her here hereafter
+       hereby herein hereupon hers herself him himself his hours ago how
+       however http inc include included includes including indeed
+       instead into its itself jan jul know known later latterly ldquo
+       llc lquo least less many mar may maybe mdash meanwhile might miss
+       more moreover most mostly much must myself nbsp ndash near nearly
+       neither never nevertheless next nobody non none nonetheless nor
+       not nothing nov now nowhere oct off often once one only onto org
+       other others otherwise our ours ourselves out over overall own
+       part particularly parts per perhaps probably quot rather rdquo
+       rquo said same seem seemed seeming seemingly seems sep set
+       several she should similar since site some somehow someone
+       something sometime sometimes somewhat somewhere still such than
+       that the their them themselves then thence thenceforth there
+       thereafter thereby therefore therein thereupon these they this
+       those though through throughout thru thus together too took
+       toward towards two under unless unlike unlikely until upon url
+       use used using usually various very via want was way well were
+       what whatever when whence whenever where whereafter whereas
+       whereby wherein whereupon wherever whether which while whither
+       who whoever whole whom whomever whose why will with within
+       without would www yes yet you your yours yourself yourselves
+     )
+     SPANISH = %w(
+       acuerdo adelante ademas adrede ahi ahora alli alrededor antano
+       ante antes apenas aproximadamente aquel aquella aquellas aquello
+       aquellos aqui arribaabajo asi aun aunque bajo bastante bien breve
+       casi cerca claro como con conmigo contigo contra cual cuales
+       cuando cuanta cuantas cuanto cuantos debajo del delante demasiado
+       dentro deprisa desde despacio despues detras dia dias donde dos
+       durante ella ellas ellos encima enfrente enseguida entre esa esas
+       ese eso esos esta estado estados estan estar estas este esto
+       estos excepto final fue fuera fueron general gran habia habla
+       hablan hace hacia han hasta hay horas hoy incluso informo junto
+       lado las lejos los luego mal mas mayor medio mejor menos menudo
+       mia mias mientras mio mios mis mismo mucho muy nada nadie ninguna
+       nos nosotras nosotros nuestra nuestras nuestro nuestros nueva
+       nuevo nunca otra otros pais para parte pasado peor pero poco por
+       porque pronto proximo puede qeu que quien quienes quiza quizas
+       raras repente salvo segun ser sera sido siempre sin sobre
+       solamente solo son soyos supuesto sus suya suyas suyo tal tambien
+       tampoco tarde temprano tiene todavia todo todos tras tus tuya
+       tuyas tuyo tuyos una unas uno unos usted ustedes veces vez
+       vosotras vosotros vuestra vuestras vuestro vuestros tudo dise
+       dicas muito
+     )
+     FRENCH = %w(des les mais pour)
+     COMMON = ENGLISH | SPANISH | FRENCH
+     # UTF-8 byte sequences, written as regexp octal escapes, for CJK
+     # punctuation treated as token separators
+     ASIAN_SPACE_CHARS = [
+       '\302\267',      # U+00B7 middle dot
+       '\343\200\201',  # U+3001 ideographic comma
+       '\343\200\202',  # U+3002 ideographic full stop
+       '\343\203\273',  # U+30FB katakana middle dot
+       '\357\274\201'   # U+FF01 fullwidth exclamation mark
+     ].join('|')
+     SPLIT_REGEX_STR = '[^[:word:]]|[[:punct:]]|' + ASIAN_SPACE_CHARS
+     SPLIT_REGEX = Regexp.new(SPLIT_REGEX_STR.force_encoding('utf-8'))
+   end
+ end
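
A quick sketch of how these constants drive tokenization; this mirrors the private Categorize.preprocess shown earlier (the sample sentence is invented, and the script is assumed to run from the gem's lib directory):

    require File.join(File.dirname(__FILE__), 'constants')

    text = 'The quick, brown fox jumps over the lazy dog!'
    tokens = text.split(Constants::Words::SPLIT_REGEX)
                 .map(&:downcase)
                 .reject { |w| w.length < 3 || Constants::Words::COMMON.include?(w) }
    # => ["quick", "brown", "fox", "jumps", "lazy", "dog"]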
data/lib/models/bag_of_words.rb ADDED
@@ -0,0 +1,97 @@
+ require File.join(File.dirname(__FILE__), '..', 'utils', 'grams')
+
+ class BagOfWords
+   include ::Utils::Grams
+
+   # TODO: some gradient descent to choose this number
+   # 0 <= MIN_SUPP <= 1, we like 0.01 <= MIN_SUPP <= 0.1
+   MIN_SUPP_L = 0.07
+   MIN_SUPP_H = 0.1
+   NUM_TOP_GRAMS = 250
+   MAX_BUCKETS = 8
+
+   # ==== Return
+   # Hash - gram => [record, ...] for the records it fits best
+   # function worst case
+   # O(2 x (#frequent_grams x #gram_collections) + #all_grams +
+   #   MAX_BUCKETS x #gram_collections)
+   def model(query, records_to_tokens)
+     @gram_cover_cache = {}
+     @gram_collections, @all_grams = create_grams(query, records_to_tokens)
+
+     top_grams = determine_frequency_term_sets(@all_grams, query)
+     top_grams = top_grams.keys.sort do |gram_c1, gram_c2|
+       top_grams[gram_c1] <=> top_grams[gram_c2]
+     end.first(MAX_BUCKETS)
+
+     # below block, worst case O(MAX_BUCKETS x #gram_collections)
+     @gram_collections.inject({}) do |buckets, gram_collection|
+       max_fitness = 0
+       max_fit = nil
+       top_grams.each do |top_gram|
+         # the >= removes the 'none' possibility
+         if gram_collection.fitness[top_gram] &&
+            gram_collection.fitness[top_gram] >= max_fitness
+           max_fitness = gram_collection.fitness[top_gram]
+           max_fit = top_gram
+         end
+       end
+       buckets[max_fit] ||= []
+       buckets[max_fit] << gram_collection.content
+       buckets
+     end
+   end
+
+   # ==== Return
+   # Hash - gram content => number of collections for which it is the best fit
+   # function worst case O(2 x (#frequent_grams x #gram_collections) + #all_grams)
+   def determine_frequency_term_sets(all_grams, query)
+     # only count a result if it has non-0 words length
+     effective_length = @gram_collections.reject do |result|
+       result.grams.nil? || result.grams.empty?
+     end.length
+
+     min_cover_l = MIN_SUPP_L * effective_length
+
+     # for speed only look at the top N grams by frequency
+     frequent_grams = all_grams.sort do |gram1, gram2|
+       gram2.frequency <=> gram1.frequency
+     end.first(NUM_TOP_GRAMS)
+
+     # below block, worst case O(#frequent_grams x #gram_collections)
+     frequent_grams = frequent_grams.delete_if do |gram|
+       !cover(gram, min_cover_l)
+     end
+
+     # below block, worst case O(#frequent_grams x #gram_collections)
+     @gram_collections.inject(Hash.new(0)) do |top_grams, gram_collection|
+       max_fitness = 0
+       max_fit = nil
+
+       frequent_grams.each do |gram|
+         fitness = gram_collection.fitness[gram.content] =
+           (gram_collection.content_to_frequency[gram.content] || 0) /
+           gram.frequency.to_f
+         if fitness > max_fitness
+           max_fitness = fitness
+           max_fit = gram.content
+         end
+       end
+
+       top_grams[max_fit] += 1 if max_fit
+       top_grams
+     end
+   end
+
+   # function worst case O(#gram_collections)
+   def cover(gram, min_length)
+     cached = @gram_cover_cache[gram]
+     return cached unless cached.nil?
+     count = 0
+     @gram_collections.each do |gram_collection|
+       frequency = gram_collection.content_to_frequency[gram.content]
+       if !frequency.nil? && frequency > 0
+         count += 1
+         return @gram_cover_cache[gram] = true if count >= min_length
+       end
+     end
+     @gram_cover_cache[gram] = false
+   end
+ end
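
The fitness score computed above is the fraction of a gram's corpus-wide frequency that falls inside a single collection, so it always lies in [0, 1]. A toy calculation with invented numbers:

    local_frequency  = 3     # 'ruby' occurs 3 times in this record
    global_frequency = 12    # 'ruby' occurs 12 times across all records
    fitness = local_frequency / global_frequency.to_f
    # => 0.25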
data/lib/utils/grams.rb ADDED
@@ -0,0 +1,45 @@
+ require File.join(File.dirname(__FILE__), 'gram_collection')
+ require File.join(File.dirname(__FILE__), 'gram_node')
+
+ module Utils
+   module Grams
+     def create_grams(query, records_to_words)
+       all_grams = []
+       @query = query
+       @query_terms = query.split.map(&:downcase).map(&:strip)
+       # the query with its first term rotated to the end,
+       # e.g. "ruby gem" => "gem ruby" (joined so the Array is not
+       # interpolated in its inspect form)
+       @query_alt = "#{@query_terms[1..-1].join(' ')} #{@query_terms[0]}"
+
+       invalid = Proc.new do |gram, *args|
+         # remove [[gram]] if == [[query]]
+         gram == @query || gram == @query_alt || @query_terms.include?(gram)
+       end
+
+       gram_collections = records_to_words.map do |record, words|
+         gram_collection = GramCollection.new(record, words, invalid)
+         all_grams += gram_collection.grams
+         gram_collection
+       end
+       return gram_collections, make_grams_unique(all_grams)
+     end
+
+     def check_plurals(frequent_grams)
+       # if both [[gram]] and [[gram]]s exist, remove [[gram]]s
+       frequent_grams_contents = frequent_grams.map(&:content)
+       frequent_grams.delete_if do |gram|
+         gram.content[-1] == 's' &&
+           frequent_grams_contents.include?(gram.content[0...-1])
+       end
+     end
+
+     def make_grams_unique(grams)
+       grams.inject({}) do |hash, gram|
+         if hash[gram.content]
+           hash[gram.content].frequency += gram.frequency
+         else
+           hash[gram.content] = gram
+         end
+         hash
+       end.values
+     end
+   end
+ end
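
A self-contained illustration of make_grams_unique merging duplicate grams by summing their frequencies. GramNode's definition is not shown in this diff, so a Struct stands in for it:

    GramStub = Struct.new(:content, :frequency)
    grams = [GramStub.new('ruby', 2), GramStub.new('rails', 1),
             GramStub.new('ruby', 3)]
    merged = grams.inject({}) do |hash, gram|
      if hash[gram.content]
        hash[gram.content].frequency += gram.frequency
      else
        hash[gram.content] = gram
      end
      hash
    end.values
    merged.map { |g| [g.content, g.frequency] }
    # => [["ruby", 5], ["rails", 1]]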
metadata ADDED
@@ -0,0 +1,49 @@
+ --- !ruby/object:Gem::Specification
+ name: categorize
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ prerelease:
+ platform: ruby
+ authors:
+ - Peter Lubell-Doughtie
+ - Helioid Inc.
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2012-06-28 00:00:00.000000000 Z
+ dependencies: []
+ description: Text categorization library
+ email: peter@helioid.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/categorize.rb
+ - lib/constants.rb
+ - lib/models/bag_of_words.rb
+ - lib/utils/grams.rb
+ homepage: http://www.helioid.com/
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.25
+ signing_key:
+ specification_version: 3
+ summary: Text categorization library
+ test_files: []