rsemantic 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +6 -13
- data/lib/{semantic.rb → rsemantic.rb} +13 -14
- data/lib/{semantic → rsemantic}/compare.rb +1 -1
- data/lib/{semantic → rsemantic}/corpus.rb +4 -4
- data/lib/{semantic → rsemantic}/document.rb +1 -1
- data/lib/{semantic → rsemantic}/matrix_transformer.rb +6 -6
- data/lib/{semantic → rsemantic}/parser.rb +1 -1
- data/lib/{semantic → rsemantic}/search.rb +21 -4
- data/lib/{semantic → rsemantic}/search_result.rb +1 -1
- data/lib/rsemantic/transform.rb +1 -0
- data/lib/{semantic → rsemantic}/transform/lsa_transform.rb +6 -3
- data/lib/{semantic → rsemantic}/transform/tf_idf_transform.rb +1 -1
- data/lib/rsemantic/vector_space.rb +1 -0
- data/lib/{semantic → rsemantic}/vector_space/builder.rb +6 -2
- data/lib/{semantic → rsemantic}/vector_space/model.rb +1 -1
- data/lib/{semantic → rsemantic}/version.rb +3 -3
- metadata +33 -39
- data/lib/semantic/transform.rb +0 -1
- data/lib/semantic/vector_space.rb +0 -1
    
        checksums.yaml
    ADDED
    
    | @@ -0,0 +1,7 @@ | |
| 1 | 
            +
            ---
         | 
| 2 | 
            +
            SHA1:
         | 
| 3 | 
            +
              metadata.gz: b772bdf3866ef3155cb94364a88c1b2268c2ffe8
         | 
| 4 | 
            +
              data.tar.gz: b9e832658a877a1b066ab77388780041fe74e9a0
         | 
| 5 | 
            +
            SHA512:
         | 
| 6 | 
            +
              metadata.gz: 30ed38b3a259d1dc5fd8398cb5747d6c396d270d37c9bb2b969c7a39be69a9dca70a4b3a1726df85b75782cf575c6752e0a173e10d353321e59cd4d8ff15a4ed
         | 
| 7 | 
            +
              data.tar.gz: ffd419f02750472c80094c75eea45eea92e0c1bee9ca445cd7b8c79cf8f40bcbf7c6a9b698437c8465e7e72ada8f4fba8d4cc4d0057fe76cafb214f256b58bf6
         | 
    
        data/README.md
    CHANGED
    
    | @@ -12,31 +12,24 @@ Documentation: http://github.com/josephwilk/rsemantic/wikis/home | |
| 12 12 | 
             
            ## Requirements:
         | 
| 13 13 |  | 
| 14 14 | 
             
            * GSL - http://www.gnu.org/software/gsl
         | 
| 15 | 
            -
            * stemmer - http://rubyforge.org/projects/stemmer/
         | 
| 16 15 |  | 
| 17 16 | 
             
            ## INSTALL:
         | 
| 18 17 |  | 
| 19 | 
            -
             | 
| 20 | 
            -
            With homebrew try this:
         | 
| 18 | 
            +
            Rsemantic requires GSL. With homebrew try this:
         | 
| 21 19 |  | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
| 26 | 
            -
            brew tap homebrew/versions
         | 
| 27 | 
            -
            brew install gsl114
         | 
| 28 | 
            -
            bundle install
         | 
| 29 | 
            -
            </code></pre>
         | 
| 20 | 
            +
            ```
         | 
| 21 | 
            +
            brew install gsl
         | 
| 22 | 
            +
            ```
         | 
| 30 23 |  | 
| 31 24 | 
             
            ## Contributors
         | 
| 32 | 
            -
            * @josephwilk
         | 
| 25 | 
            +
            * [@josephwilk](http://blog.josephwilk.net)
         | 
| 33 26 | 
             
            * @dominikhonnef
         | 
| 34 27 |  | 
| 35 28 | 
             
            ## LICENSE
         | 
| 36 29 |  | 
| 37 30 | 
             
            (The MIT License)
         | 
| 38 31 |  | 
| 39 | 
            -
            Copyright (c) 2008- | 
| 32 | 
            +
            Copyright (c) 2008-2014 Joseph Wilk
         | 
| 40 33 |  | 
| 41 34 | 
             
            Permission is hereby granted, free of charge, to any person obtaining
         | 
| 42 35 | 
             
            a copy of this software and associated documentation files (the
         | 
| @@ -1,25 +1,24 @@ | |
| 1 1 | 
             
            $:.unshift(File.dirname(__FILE__)) unless
         | 
| 2 2 | 
             
            $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
         | 
| 3 3 |  | 
| 4 | 
            -
            require " | 
| 5 | 
            -
            require " | 
| 6 | 
            -
            require " | 
| 7 | 
            -
            require " | 
| 8 | 
            -
            require " | 
| 9 | 
            -
            require " | 
| 10 | 
            -
            require " | 
| 11 | 
            -
             | 
| 12 | 
            -
            require " | 
| 13 | 
            -
            require " | 
| 14 | 
            -
            require " | 
| 15 | 
            -
             | 
| 16 | 
            -
            require 'rubygems'
         | 
| 4 | 
            +
            require "rsemantic/vector_space"
         | 
| 5 | 
            +
            require "rsemantic/compare"
         | 
| 6 | 
            +
            require "rsemantic/parser"
         | 
| 7 | 
            +
            require "rsemantic/matrix_transformer"
         | 
| 8 | 
            +
            require "rsemantic/search"
         | 
| 9 | 
            +
            require "rsemantic/transform"
         | 
| 10 | 
            +
            require "rsemantic/version"
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            require "rsemantic/corpus"
         | 
| 13 | 
            +
            require "rsemantic/document"
         | 
| 14 | 
            +
            require "rsemantic/search_result"
         | 
| 15 | 
            +
             | 
| 17 16 | 
             
            require 'gsl'
         | 
| 18 17 |  | 
| 19 18 | 
             
            require 'stemmer'
         | 
| 20 19 | 
             
            require 'logger'
         | 
| 21 20 |  | 
| 22 | 
            -
            module  | 
| 21 | 
            +
            module RSemantic
         | 
| 23 22 |  | 
| 24 23 | 
             
              class << self
         | 
| 25 24 | 
             
                attr_writer :logger
         | 
| @@ -1,4 +1,4 @@ | |
| 1 | 
            -
            module  | 
| 1 | 
            +
            module RSemantic
         | 
| 2 2 | 
             
              class Corpus
         | 
| 3 3 | 
             
                # @return [Array<Document>]
         | 
| 4 4 | 
             
                attr_reader :documents
         | 
| @@ -30,7 +30,7 @@ module Semantic | |
| 30 30 | 
             
                #
         | 
| 31 31 | 
             
                # @return [void]
         | 
| 32 32 | 
             
                def build_index
         | 
| 33 | 
            -
                  @search =  | 
| 33 | 
            +
                  @search = RSemantic::Search.new(@documents.map(&:text), @options)
         | 
| 34 34 | 
             
                end
         | 
| 35 35 |  | 
| 36 36 | 
             
                def search(*words)
         | 
| @@ -38,14 +38,14 @@ module Semantic | |
| 38 38 | 
             
                  results = @search.search(words)
         | 
| 39 39 | 
             
                  results.map.with_index { |result, index|
         | 
| 40 40 | 
             
                    document = @documents[index]
         | 
| 41 | 
            -
                     | 
| 41 | 
            +
                    RSemantic::SearchResult.new(document, result)
         | 
| 42 42 | 
             
                  }.sort
         | 
| 43 43 | 
             
                end
         | 
| 44 44 |  | 
| 45 45 | 
             
                def find_related_document(document)
         | 
| 46 46 | 
             
                  @search.related(@documents.index(document)).map.with_index { |result, index|
         | 
| 47 47 | 
             
                    document = @documents[index]
         | 
| 48 | 
            -
                     | 
| 48 | 
            +
                    RSemantic::SearchResult.new(document, result)
         | 
| 49 49 | 
             
                  }.sort
         | 
| 50 50 | 
             
                end
         | 
| 51 51 |  | 
| @@ -1,4 +1,4 @@ | |
| 1 | 
            -
            module  | 
| 1 | 
            +
            module RSemantic
         | 
| 2 2 | 
             
              class MatrixTransformer
         | 
| 3 3 |  | 
| 4 4 | 
             
                def initialize(transforms)
         | 
| @@ -8,13 +8,13 @@ module Semantic | |
| 8 8 | 
             
                def apply_transforms(vector_space_model)
         | 
| 9 9 | 
             
                  @transforms.each do |transform|
         | 
| 10 10 | 
             
                    begin
         | 
| 11 | 
            -
                      transform_class =  | 
| 12 | 
            -
                       | 
| 11 | 
            +
                      transform_class = RSemantic::Transform.const_get(transform)
         | 
| 12 | 
            +
                      RSemantic.logger.info("Applying #{transform} transform")
         | 
| 13 13 | 
             
                      transform_class.transform!(vector_space_model.matrix)
         | 
| 14 | 
            -
                       | 
| 14 | 
            +
                      RSemantic.logger.info(vector_space_model)
         | 
| 15 15 | 
             
                    rescue => e
         | 
| 16 | 
            -
                       | 
| 17 | 
            -
                       | 
| 16 | 
            +
                      RSemantic.logger.error("Error: Cannot perform transform: #{transform}")
         | 
| 17 | 
            +
                      RSemantic.logger.error(e)
         | 
| 18 18 | 
             
                    end
         | 
| 19 19 | 
             
                  end
         | 
| 20 20 | 
             
                  vector_space_model
         | 
| @@ -1,5 +1,6 @@ | |
| 1 | 
            -
            module  | 
| 1 | 
            +
            module RSemantic
         | 
| 2 2 | 
             
              class Search
         | 
| 3 | 
            +
                attr_reader :builder
         | 
| 3 4 |  | 
| 4 5 | 
             
                def initialize(documents, options = {})
         | 
| 5 6 | 
             
                  options = {
         | 
| @@ -8,15 +9,19 @@ module Semantic | |
| 8 9 | 
             
                    :filter_stop_words => true,
         | 
| 9 10 | 
             
                    :stem_words => true,
         | 
| 10 11 | 
             
                  }.merge(options)
         | 
| 11 | 
            -
                   | 
| 12 | 
            +
                  RSemantic.logger.level = options[:verbose] ? Logger::INFO : Logger::ERROR
         | 
| 12 13 |  | 
| 13 14 |  | 
| 14 | 
            -
                  @builder = VectorSpace::Builder.new( | 
| 15 | 
            +
                  @builder = VectorSpace::Builder.new(
         | 
| 16 | 
            +
                    :filter_stop_words => options[:filter_stop_words],
         | 
| 17 | 
            +
                    :stem_words => options[:stem_words],
         | 
| 18 | 
            +
                    :locale => options[:locale]
         | 
| 19 | 
            +
                  )
         | 
| 15 20 | 
             
                  @matrix_transformer = MatrixTransformer.new(options[:transforms])
         | 
| 16 21 |  | 
| 17 22 | 
             
                  @vector_space_model = @builder.build_document_matrix(documents)
         | 
| 18 23 |  | 
| 19 | 
            -
                   | 
| 24 | 
            +
                  RSemantic.logger.info(@vector_space_model)
         | 
| 20 25 |  | 
| 21 26 | 
             
                  @vector_space_model = @matrix_transformer.apply_transforms(@vector_space_model)
         | 
| 22 27 | 
             
                end
         | 
| @@ -37,5 +42,17 @@ module Semantic | |
| 37 42 | 
             
                  end
         | 
| 38 43 | 
             
                  ratings
         | 
| 39 44 | 
             
                end
         | 
| 45 | 
            +
             | 
| 46 | 
            +
                protected
         | 
| 47 | 
            +
             | 
| 48 | 
            +
                def marshal_dump
         | 
| 49 | 
            +
                  [@builder, @matrix_transformer, @vector_space_model.to_a]
         | 
| 50 | 
            +
                end
         | 
| 51 | 
            +
             | 
| 52 | 
            +
                def marshal_load(array)
         | 
| 53 | 
            +
                  @builder = array.shift
         | 
| 54 | 
            +
                  @matrix_transformer = array.shift
         | 
| 55 | 
            +
                  @vector_space_model = GSL::Matrix.alloc(*array.shift)
         | 
| 56 | 
            +
                end
         | 
| 40 57 | 
             
              end
         | 
| 41 58 | 
             
            end
         | 
| @@ -0,0 +1 @@ | |
| 1 | 
            +
            %w{tf_idf lsa}.each{|f| require "rsemantic/transform/#{f}_transform.rb"}
         | 
| @@ -1,4 +1,4 @@ | |
| 1 | 
            -
            module  | 
| 1 | 
            +
            module RSemantic
         | 
| 2 2 | 
             
              module Transform
         | 
| 3 3 | 
             
                class LSA
         | 
| 4 4 |  | 
| @@ -8,11 +8,14 @@ module Semantic | |
| 8 8 | 
             
                      # TODO configurable rank
         | 
| 9 9 | 
             
                      columns = matrix.size2
         | 
| 10 10 |  | 
| 11 | 
            -
                       | 
| 11 | 
            +
                      # if M < N perform SVD on transponsed matrix
         | 
| 12 | 
            +
                      matrix.size1 < matrix.size2 ? (u, v, sigma = matrix.transpose.SV_decomp_mod) : (u, v, sigma = matrix.SV_decomp_mod)
         | 
| 13 | 
            +
             | 
| 12 14 | 
             
                      reduce_dimensions!(sigma, rank)
         | 
| 13 15 | 
             
                      sigma = GSL::Matrix.diagonal(sigma)
         | 
| 14 16 |  | 
| 15 | 
            -
                       | 
| 17 | 
            +
                      # if M < N return transposed result
         | 
| 18 | 
            +
                      matrix.size1 < matrix.size2 ? GSL::Matrix.swap(matrix, (u * sigma * v.transpose).transpose) : GSL::Matrix.swap(matrix, u * sigma * v.transpose)
         | 
| 16 19 | 
             
                    end
         | 
| 17 20 |  | 
| 18 21 | 
             
                    private
         | 
| @@ -0,0 +1 @@ | |
| 1 | 
            +
            %w{model builder}.each{|f| require "rsemantic/vector_space/#{f}"}
         | 
| @@ -1,12 +1,16 @@ | |
| 1 | 
            -
            module  | 
| 1 | 
            +
            module RSemantic
         | 
| 2 2 | 
             
              module VectorSpace
         | 
| 3 3 | 
             
                # A algebraic model for representing text documents as vectors of identifiers.
         | 
| 4 4 | 
             
                # A document is represented as a vector. Each dimension of the vector corresponds to a
         | 
| 5 5 | 
             
                # separate term. If a term occurs in the document, then the value in the vector is non-zero.
         | 
| 6 6 | 
             
                class Builder
         | 
| 7 | 
            +
                  attr_reader :parsed_document_cache
         | 
| 7 8 |  | 
| 8 9 | 
             
                  def initialize(options = {})
         | 
| 9 | 
            -
                    @parser = Parser.new( | 
| 10 | 
            +
                    @parser = Parser.new(
         | 
| 11 | 
            +
                      :filter_stop_words => options[:filter_stop_words],
         | 
| 12 | 
            +
                      :locale => options[:locale]
         | 
| 13 | 
            +
                    )
         | 
| 10 14 | 
             
                    @parsed_document_cache = []
         | 
| 11 15 | 
             
                  end
         | 
| 12 16 |  | 
    
        metadata
    CHANGED
    
    | @@ -1,48 +1,43 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: rsemantic
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0. | 
| 5 | 
            -
              prerelease: 
         | 
| 4 | 
            +
              version: 0.3.0
         | 
| 6 5 | 
             
            platform: ruby
         | 
| 7 6 | 
             
            authors:
         | 
| 8 7 | 
             
            - Joseph Wilk
         | 
| 9 8 | 
             
            autorequire: 
         | 
| 10 9 | 
             
            bindir: bin
         | 
| 11 10 | 
             
            cert_chain: []
         | 
| 12 | 
            -
            date:  | 
| 11 | 
            +
            date: 2014-03-04 00:00:00.000000000 Z
         | 
| 13 12 | 
             
            dependencies:
         | 
| 14 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 15 14 | 
             
              name: gsl
         | 
| 16 15 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| 17 | 
            -
                none: false
         | 
| 18 16 | 
             
                requirements:
         | 
| 19 | 
            -
                - - ' | 
| 17 | 
            +
                - - '>='
         | 
| 20 18 | 
             
                  - !ruby/object:Gem::Version
         | 
| 21 | 
            -
                    version:  | 
| 19 | 
            +
                    version: '0'
         | 
| 22 20 | 
             
              type: :runtime
         | 
| 23 21 | 
             
              prerelease: false
         | 
| 24 22 | 
             
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 25 | 
            -
                none: false
         | 
| 26 23 | 
             
                requirements:
         | 
| 27 | 
            -
                - - ' | 
| 24 | 
            +
                - - '>='
         | 
| 28 25 | 
             
                  - !ruby/object:Gem::Version
         | 
| 29 | 
            -
                    version:  | 
| 26 | 
            +
                    version: '0'
         | 
| 30 27 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 31 28 | 
             
              name: fast-stemmer
         | 
| 32 29 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| 33 | 
            -
                none: false
         | 
| 34 30 | 
             
                requirements:
         | 
| 35 | 
            -
                - -  | 
| 31 | 
            +
                - - '>='
         | 
| 36 32 | 
             
                  - !ruby/object:Gem::Version
         | 
| 37 | 
            -
                    version:  | 
| 33 | 
            +
                    version: '0'
         | 
| 38 34 | 
             
              type: :runtime
         | 
| 39 35 | 
             
              prerelease: false
         | 
| 40 36 | 
             
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 41 | 
            -
                none: false
         | 
| 42 37 | 
             
                requirements:
         | 
| 43 | 
            -
                - -  | 
| 38 | 
            +
                - - '>='
         | 
| 44 39 | 
             
                  - !ruby/object:Gem::Version
         | 
| 45 | 
            -
                    version:  | 
| 40 | 
            +
                    version: '0'
         | 
| 46 41 | 
             
            description: A document vector search with flexible matrix transforms. Currently supports
         | 
| 47 42 | 
             
              Latent semantic analysis and Term frequency - inverse document frequency
         | 
| 48 43 | 
             
            email:
         | 
| @@ -54,21 +49,24 @@ extra_rdoc_files: | |
| 54 49 | 
             
            - README.md
         | 
| 55 50 | 
             
            - TODO.txt
         | 
| 56 51 | 
             
            files:
         | 
| 57 | 
            -
            -  | 
| 58 | 
            -
            -  | 
| 59 | 
            -
            -  | 
| 60 | 
            -
            - lib/ | 
| 61 | 
            -
            - lib/ | 
| 62 | 
            -
            - lib/ | 
| 63 | 
            -
            - lib/ | 
| 64 | 
            -
            - lib/ | 
| 65 | 
            -
            - lib/ | 
| 66 | 
            -
            - lib/ | 
| 67 | 
            -
            - lib/ | 
| 68 | 
            -
            - lib/ | 
| 69 | 
            -
            - lib/ | 
| 70 | 
            -
            - lib/ | 
| 71 | 
            -
            - lib/ | 
| 52 | 
            +
            - History.txt
         | 
| 53 | 
            +
            - README.md
         | 
| 54 | 
            +
            - TODO.txt
         | 
| 55 | 
            +
            - lib/rsemantic.rb
         | 
| 56 | 
            +
            - lib/rsemantic/compare.rb
         | 
| 57 | 
            +
            - lib/rsemantic/corpus.rb
         | 
| 58 | 
            +
            - lib/rsemantic/document.rb
         | 
| 59 | 
            +
            - lib/rsemantic/matrix_transformer.rb
         | 
| 60 | 
            +
            - lib/rsemantic/parser.rb
         | 
| 61 | 
            +
            - lib/rsemantic/search.rb
         | 
| 62 | 
            +
            - lib/rsemantic/search_result.rb
         | 
| 63 | 
            +
            - lib/rsemantic/transform.rb
         | 
| 64 | 
            +
            - lib/rsemantic/transform/lsa_transform.rb
         | 
| 65 | 
            +
            - lib/rsemantic/transform/tf_idf_transform.rb
         | 
| 66 | 
            +
            - lib/rsemantic/vector_space.rb
         | 
| 67 | 
            +
            - lib/rsemantic/vector_space/builder.rb
         | 
| 68 | 
            +
            - lib/rsemantic/vector_space/model.rb
         | 
| 69 | 
            +
            - lib/rsemantic/version.rb
         | 
| 72 70 | 
             
            - lib/tasks/rspec.rake
         | 
| 73 71 | 
             
            - resources/ar.stop
         | 
| 74 72 | 
             
            - resources/ca.stop
         | 
| @@ -91,34 +89,30 @@ files: | |
| 91 89 | 
             
            - resources/ru.stop
         | 
| 92 90 | 
             
            - resources/sv.stop
         | 
| 93 91 | 
             
            - resources/tr.stop
         | 
| 94 | 
            -
            - History.txt
         | 
| 95 | 
            -
            - README.md
         | 
| 96 | 
            -
            - TODO.txt
         | 
| 97 92 | 
             
            homepage: http://github.com/josephwilk/rsemantic
         | 
| 98 93 | 
             
            licenses:
         | 
| 99 94 | 
             
            - MIT
         | 
| 95 | 
            +
            metadata: {}
         | 
| 100 96 | 
             
            post_install_message: 
         | 
| 101 97 | 
             
            rdoc_options:
         | 
| 102 98 | 
             
            - --charset=UTF-8
         | 
| 103 99 | 
             
            require_paths:
         | 
| 104 100 | 
             
            - lib
         | 
| 105 101 | 
             
            required_ruby_version: !ruby/object:Gem::Requirement
         | 
| 106 | 
            -
              none: false
         | 
| 107 102 | 
             
              requirements:
         | 
| 108 | 
            -
              - -  | 
| 103 | 
            +
              - - '>='
         | 
| 109 104 | 
             
                - !ruby/object:Gem::Version
         | 
| 110 105 | 
             
                  version: '0'
         | 
| 111 106 | 
             
            required_rubygems_version: !ruby/object:Gem::Requirement
         | 
| 112 | 
            -
              none: false
         | 
| 113 107 | 
             
              requirements:
         | 
| 114 | 
            -
              - -  | 
| 108 | 
            +
              - - '>='
         | 
| 115 109 | 
             
                - !ruby/object:Gem::Version
         | 
| 116 110 | 
             
                  version: '0'
         | 
| 117 111 | 
             
            requirements: []
         | 
| 118 112 | 
             
            rubyforge_project: 
         | 
| 119 | 
            -
            rubygems_version:  | 
| 113 | 
            +
            rubygems_version: 2.2.2
         | 
| 120 114 | 
             
            signing_key: 
         | 
| 121 | 
            -
            specification_version:  | 
| 115 | 
            +
            specification_version: 4
         | 
| 122 116 | 
             
            summary: A document vector search with flexible matrix transforms. Currently supports
         | 
| 123 117 | 
             
              Latent semantic analysis and Term frequency - inverse document frequency
         | 
| 124 118 | 
             
            test_files: []
         | 
    
        data/lib/semantic/transform.rb
    DELETED
    
    | @@ -1 +0,0 @@ | |
| 1 | 
            -
            %w{tf_idf lsa}.each{|f| require "semantic/transform/#{f}_transform.rb"}
         | 
| @@ -1 +0,0 @@ | |
| 1 | 
            -
            %w{model builder}.each{|f| require "semantic/vector_space/#{f}"}
         |