rblearn 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/lib/rblearn/CountVectorizer.rb +83 -0
 - data/lib/rblearn/CrossValidation.rb +45 -0
 - data/lib/rblearn/version.rb +1 -1
 - data/lib/rblearn.rb +6 -1
 - data/rblearn.gemspec +4 -0
 - metadata +46 -1
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA1:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: e421c233da7861bb3062a93b964b9cc4a3b23f1c
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: b2be5eb9d5f61bb7a70bda13be2d876afd509dbc
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 6b2d6557d3a07864fe0a50a892f0ea7540fe44a585aaf451afdd220255c11fbc1bf70f9429af955b8b6b8958d4dcb157df46ae67cd0ac1d0091810489c857cc2
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: eb58039f44c2584d81b524c9dc6faaf24a32ffbf3b31d33a61817e668ede474fdd021fd315c69cfcced7fa4dc74a648b2f696ab32fed50642508e6d4f7a91de7
         
     | 
| 
         @@ -0,0 +1,83 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
             
     | 
| 
      
 2 
     | 
    
         
            +
            module Rblearn
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
      
 4 
     | 
    
         
            +
            	class CountVectorizer
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
                # TODO: consider access control for all variables
         
     | 
| 
      
 7 
     | 
    
         
            +
            		attr_accessor :feature_names, :doc_matrix, :token2index
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
            		# tokenizer: lambda function :: string -> Array<string>
         
     | 
| 
      
 10 
     | 
    
         
            +
            		# lowercase: whether tokens are converted to lowercase :: bool
         
     | 
| 
      
 11 
     | 
    
         
            +
            		# stop_words: list of stop words :: Array<string>
         
     | 
| 
      
 12 
     | 
    
         
            +
            		# max_features: fraction of the most frequent tokens kept as features :: Float \in [0, 1]
         
     | 
| 
      
 13 
     | 
    
         
            +
            		# TODO: by max_features, zero vectors are sometimes created.
         
     | 
| 
      
 14 
     | 
    
         
            +
            		def initialize(tokenizer, lowercase=true, max_features=0.8)
         
     | 
| 
      
 15 
     | 
    
         
            +
            			@tokenizer = tokenizer
         
     | 
| 
      
 16 
     | 
    
         
            +
            			@lowercase = lowercase
         
     | 
| 
      
 17 
     | 
    
         
            +
             
     | 
| 
      
 18 
     | 
    
         
            +
            			stop_words = Stopwords::STOP_WORDS
         
     | 
| 
      
 19 
     | 
    
         
            +
            			stop_words.map! {|token| token.stem}
         
     | 
| 
      
 20 
     | 
    
         
            +
            			stop_words.map! {|token| token.downcase} if @lowercase
         
     | 
| 
      
 21 
     | 
    
         
            +
            			@stopwords = stop_words
         
     | 
| 
      
 22 
     | 
    
         
            +
            			@max_feature = max_features
         
     | 
| 
      
 23 
     | 
    
         
            +
            		end
         
     | 
| 
      
 24 
     | 
    
         
            +
             
     | 
| 
      
 25 
     | 
    
         
            +
            		# features: each document's text :: Array<String> -> Numo::Int32
         
     | 
| 
      
 26 
     | 
    
         
            +
            		def fit_transform(features)
         
     | 
| 
      
 27 
     | 
    
         
            +
            			all_vocaburaries = []
         
     | 
| 
      
 28 
     | 
    
         
            +
            			word_frequency = Hash.new{|hash, key| hash[key] = 0}
         
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
      
 30 
     | 
    
         
            +
            			features.each do |feature|
         
     | 
| 
      
 31 
     | 
    
         
            +
            				@tokenizer.call(feature).each do |token|
         
     | 
| 
      
 32 
     | 
    
         
            +
            					token.downcase! if @lowercase
         
     | 
| 
      
 33 
     | 
    
         
            +
            					all_vocaburaries << token
         
     | 
| 
      
 34 
     | 
    
         
            +
            					word_frequency[token] += 1
         
     | 
| 
      
 35 
     | 
    
         
            +
            				end
         
     | 
| 
      
 36 
     | 
    
         
            +
            			end
         
     | 
| 
      
 37 
     | 
    
         
            +
             
     | 
| 
      
 38 
     | 
    
         
            +
            			all_vocaburaries.uniq!
         
     | 
| 
      
 39 
     | 
    
         
            +
            			word_frequency =  word_frequency.sort{|(_, value1), (_, value2)| value2 <=> value1}
         
     | 
| 
      
 40 
     | 
    
         
            +
            			feature_names = (0...(word_frequency.size * @max_feature).to_i).map{|i| word_frequency[i][0]}
         
     | 
| 
      
 41 
     | 
    
         
            +
             
     | 
| 
      
 42 
     | 
    
         
            +
            			token2index = {}
         
     | 
| 
      
 43 
     | 
    
         
            +
            			feature_names.each_with_index do |token, i|
         
     | 
| 
      
 44 
     | 
    
         
            +
            				token2index[token] = i
         
     | 
| 
      
 45 
     | 
    
         
            +
            			end
         
     | 
| 
      
 46 
     | 
    
         
            +
             
     | 
| 
      
 47 
     | 
    
         
            +
            			doc_matrix = Numo::Int32.zeros([features.size, feature_names.size])
         
     | 
| 
      
 48 
     | 
    
         
            +
            			features.each_with_index do |feature, doc_id|
         
     | 
| 
      
 49 
     | 
    
         
            +
            				tokens = []
         
     | 
| 
      
 50 
     | 
    
         
            +
            				@tokenizer.call(feature).each do |token|
         
     | 
| 
      
 51 
     | 
    
         
            +
            					token.downcase! if @lowercase
         
     | 
| 
      
 52 
     | 
    
         
            +
            					tokens << token unless @stopwords.include?(token)
         
     | 
| 
      
 53 
     | 
    
         
            +
            				end
         
     | 
| 
      
 54 
     | 
    
         
            +
             
     | 
| 
      
 55 
     | 
    
         
            +
            				# BoW representation
         
     | 
| 
      
 56 
     | 
    
         
            +
            				counter = Hash.new{|hash, key| hash[key] = 0}
         
     | 
| 
      
 57 
     | 
    
         
            +
            				tokens.each do |token|
         
     | 
| 
      
 58 
     | 
    
         
            +
            					counter[token] += 1
         
     | 
| 
      
 59 
     | 
    
         
            +
            				end
         
     | 
| 
      
 60 
     | 
    
         
            +
             
     | 
| 
      
 61 
     | 
    
         
            +
            				counter.each do |token, freq|
         
     | 
| 
      
 62 
     | 
    
         
            +
            					doc_matrix[doc_id, token2index[token]] = freq if token2index[token]
         
     | 
| 
      
 63 
     | 
    
         
            +
            				end
         
     | 
| 
      
 64 
     | 
    
         
            +
            			end
         
     | 
| 
      
 65 
     | 
    
         
            +
             
     | 
| 
      
 66 
     | 
    
         
            +
            			@doc_matrix = doc_matrix
         
     | 
| 
      
 67 
     | 
    
         
            +
            			@feature_names = feature_names
         
     | 
| 
      
 68 
     | 
    
         
            +
            			@token2index = token2index
         
     | 
| 
      
 69 
     | 
    
         
            +
            			return @doc_matrix
         
     | 
| 
      
 70 
     | 
    
         
            +
            		end
         
     | 
| 
      
 71 
     | 
    
         
            +
            	end
         
     | 
| 
      
 72 
     | 
    
         
            +
             
     | 
| 
      
 73 
     | 
    
         
            +
             
     | 
| 
      
 74 
     | 
    
         
            +
             
     | 
| 
      
 75 
     | 
    
         
            +
            	if __FILE__ == $0
         
     | 
| 
      
 76 
     | 
    
         
            +
            		cv = CountVectorizer.new(lambda{|s| s.split.map{|token| token.stem}}, 1, 0.8)
         
     | 
| 
      
 77 
     | 
    
         
            +
            		features = ['I am train man which automata and philosophy', 'numerical analysis young man', 'logic programmer']
         
     | 
| 
      
 78 
     | 
    
         
            +
            		p cv.fit_transform(features)
         
     | 
| 
      
 79 
     | 
    
         
            +
            		p cv.feature_names
         
     | 
| 
      
 80 
     | 
    
         
            +
            		p cv.token2index
         
     | 
| 
      
 81 
     | 
    
         
            +
            	end
         
     | 
| 
      
 82 
     | 
    
         
            +
             
     | 
| 
      
 83 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,45 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
             
     | 
| 
      
 2 
     | 
    
         
            +
            module Rblearn
         
     | 
| 
      
 3 
     | 
    
         
            +
              module CrossValidation
         
     | 
| 
      
 4 
     | 
    
         
            +
                # x, y: NArray objects
         
     | 
| 
      
 5 
     | 
    
         
            +
                # We slice a matrix by x[Array<Integer>, true]
         
     | 
| 
      
 6 
     | 
    
         
            +
                def self.train_test_split(x, y, test_size=0.33)
         
     | 
| 
      
 7 
     | 
    
         
            +
                  doc_size = x.shape[0]
         
     | 
| 
      
 8 
     | 
    
         
            +
                  random_indices = (0...doc_size).to_a.shuffle
         
     | 
| 
      
 9 
     | 
    
         
            +
                  endpoint = (doc_size * test_size).to_i
         
     | 
| 
      
 10 
     | 
    
         
            +
                  train_indices = random_indices[endpoint..-1]
         
     | 
| 
      
 11 
     | 
    
         
            +
                  test_indices = random_indices[0...endpoint]
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
                  return [x[train_indices, true], y[train_indices, true], x[test_indices, true], y[test_indices, true]]
         
     | 
| 
      
 14 
     | 
    
         
            +
                end
         
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
      
 16 
     | 
    
         
            +
                class KFold
         
     | 
| 
      
 17 
     | 
    
         
            +
                  # TODO: make indices and n_folds private
         
     | 
| 
      
 18 
     | 
    
         
            +
             
     | 
| 
      
 19 
     | 
    
         
            +
                  def initialize(n, n_folds, shuffle)
         
     | 
| 
      
 20 
     | 
    
         
            +
                    indices = (0...n).to_a
         
     | 
| 
      
 21 
     | 
    
         
            +
                    indices.shuffle! if shuffle
         
     | 
| 
      
 22 
     | 
    
         
            +
                    @indices = indices
         
     | 
| 
      
 23 
     | 
    
         
            +
                    @n_folds = n_folds
         
     | 
| 
      
 24 
     | 
    
         
            +
                  end
         
     | 
| 
      
 25 
     | 
    
         
            +
             
     | 
| 
      
 26 
     | 
    
         
            +
                  def create
         
     | 
| 
      
 27 
     | 
    
         
            +
                    groups_nfolds = @indices.each_slice((@indices.size.to_f / @n_folds).ceil).to_a
         
     | 
| 
      
 28 
     | 
    
         
            +
                    groups = []
         
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
      
 30 
     | 
    
         
            +
                    @n_folds.times do |k|
         
     | 
| 
      
 31 
     | 
    
         
            +
                      validation_set = []
         
     | 
| 
      
 32 
     | 
    
         
            +
                      test_set = []
         
     | 
| 
      
 33 
     | 
    
         
            +
             
     | 
| 
      
 34 
     | 
    
         
            +
                      @n_folds.times do |j|
         
     | 
| 
      
 35 
     | 
    
         
            +
                        test_set += groups_nfolds[j] if k == j
         
     | 
| 
      
 36 
     | 
    
         
            +
                        validation_set += groups_nfolds[j] unless k == j
         
     | 
| 
      
 37 
     | 
    
         
            +
                      end
         
     | 
| 
      
 38 
     | 
    
         
            +
                      groups << [validation_set, test_set]
         
     | 
| 
      
 39 
     | 
    
         
            +
                    end
         
     | 
| 
      
 40 
     | 
    
         
            +
             
     | 
| 
      
 41 
     | 
    
         
            +
                    return groups
         
     | 
| 
      
 42 
     | 
    
         
            +
                  end
         
     | 
| 
      
 43 
     | 
    
         
            +
                end
         
     | 
| 
      
 44 
     | 
    
         
            +
              end
         
     | 
| 
      
 45 
     | 
    
         
            +
            end
         
     | 
    
        data/lib/rblearn/version.rb
    CHANGED
    
    
    
        data/lib/rblearn.rb
    CHANGED
    
    
    
        data/rblearn.gemspec
    CHANGED
    
    | 
         @@ -25,4 +25,8 @@ Gem::Specification.new do |spec| 
     | 
|
| 
       25 
25 
     | 
    
         
             
              spec.add_development_dependency "bundler", "~> 1.11"
         
     | 
| 
       26 
26 
     | 
    
         
             
              spec.add_development_dependency "rake", "~> 10.0"
         
     | 
| 
       27 
27 
     | 
    
         
             
              spec.add_development_dependency "rspec", "~> 3.0"
         
     | 
| 
      
 28 
     | 
    
         
            +
             
     | 
| 
      
 29 
     | 
    
         
            +
              spec.add_runtime_dependency "numo-narray"
         
     | 
| 
      
 30 
     | 
    
         
            +
              spec.add_runtime_dependency 'stopwords'
         
     | 
| 
      
 31 
     | 
    
         
            +
              spec.add_runtime_dependency 'stemmify'
         
     | 
| 
       28 
32 
     | 
    
         
             
            end
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: rblearn
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0. 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.2.0
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - himkt
         
     | 
| 
         @@ -52,6 +52,48 @@ dependencies: 
     | 
|
| 
       52 
52 
     | 
    
         
             
                - - "~>"
         
     | 
| 
       53 
53 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       54 
54 
     | 
    
         
             
                    version: '3.0'
         
     | 
| 
      
 55 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency
         
     | 
| 
      
 56 
     | 
    
         
            +
              name: numo-narray
         
     | 
| 
      
 57 
     | 
    
         
            +
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
      
 58 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 59 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 60 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 61 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 62 
     | 
    
         
            +
              type: :runtime
         
     | 
| 
      
 63 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 64 
     | 
    
         
            +
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
      
 65 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 66 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 67 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 68 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 69 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency
         
     | 
| 
      
 70 
     | 
    
         
            +
              name: stopwords
         
     | 
| 
      
 71 
     | 
    
         
            +
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
      
 72 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 73 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 74 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 75 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 76 
     | 
    
         
            +
              type: :runtime
         
     | 
| 
      
 77 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 78 
     | 
    
         
            +
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
      
 79 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 80 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 81 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 82 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 83 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency
         
     | 
| 
      
 84 
     | 
    
         
            +
              name: stemmify
         
     | 
| 
      
 85 
     | 
    
         
            +
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
      
 86 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 87 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 88 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 89 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 90 
     | 
    
         
            +
              type: :runtime
         
     | 
| 
      
 91 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 92 
     | 
    
         
            +
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
      
 93 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 94 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 95 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 96 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
       55 
97 
     | 
    
         
             
            description: rblearn (ruby-learn) provides methods for feature extracting and some
         
     | 
| 
       56 
98 
     | 
    
         
             
              algorithms.
         
     | 
| 
       57 
99 
     | 
    
         
             
            email:
         
     | 
| 
         @@ -71,6 +113,8 @@ files: 
     | 
|
| 
       71 
113 
     | 
    
         
             
            - bin/console
         
     | 
| 
       72 
114 
     | 
    
         
             
            - bin/setup
         
     | 
| 
       73 
115 
     | 
    
         
             
            - lib/rblearn.rb
         
     | 
| 
      
 116 
     | 
    
         
            +
            - lib/rblearn/CountVectorizer.rb
         
     | 
| 
      
 117 
     | 
    
         
            +
            - lib/rblearn/CrossValidation.rb
         
     | 
| 
       74 
118 
     | 
    
         
             
            - lib/rblearn/version.rb
         
     | 
| 
       75 
119 
     | 
    
         
             
            - rblearn.gemspec
         
     | 
| 
       76 
120 
     | 
    
         
             
            homepage: https://github.com/himkt/rblearn
         
     | 
| 
         @@ -98,3 +142,4 @@ signing_key: 
     | 
|
| 
       98 
142 
     | 
    
         
             
            specification_version: 4
         
     | 
| 
       99 
143 
     | 
    
         
             
            summary: Simple repository for machine learning
         
     | 
| 
       100 
144 
     | 
    
         
             
            test_files: []
         
     | 
| 
      
 145 
     | 
    
         
            +
            has_rdoc: 
         
     |