jruby_mahout 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +6 -0
- data/.rspec +2 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +27 -0
- data/MIT-LICENSE +20 -0
- data/README.md +30 -0
- data/Rakefile +6 -0
- data/jruby_mahout.gemspec +22 -0
- data/lib/jruby_mahout.rb +8 -0
- data/lib/jruby_mahout/data_model.rb +20 -0
- data/lib/jruby_mahout/evaluator.rb +18 -0
- data/lib/jruby_mahout/mahout_imports.rb +34 -0
- data/lib/jruby_mahout/mysql_manager.rb +5 -0
- data/lib/jruby_mahout/postgres_manager.rb +76 -0
- data/lib/jruby_mahout/recommender.rb +78 -0
- data/lib/jruby_mahout/recommender_builder.rb +56 -0
- data/lib/jruby_mahout/version.rb +3 -0
- data/spec/recommender_data.csv +10000 -0
- data/spec/recommender_spec.rb +296 -0
- data/spec/spec_helper.rb +7 -0
- metadata +110 -0
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
jruby_mahout (0.2.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: http://rubygems.org/
|
8
|
+
specs:
|
9
|
+
diff-lcs (1.1.3)
|
10
|
+
rake (10.0.1)
|
11
|
+
rspec (2.12.0)
|
12
|
+
rspec-core (~> 2.12.0)
|
13
|
+
rspec-expectations (~> 2.12.0)
|
14
|
+
rspec-mocks (~> 2.12.0)
|
15
|
+
rspec-core (2.12.0)
|
16
|
+
rspec-expectations (2.12.0)
|
17
|
+
diff-lcs (~> 1.1.3)
|
18
|
+
rspec-mocks (2.12.0)
|
19
|
+
|
20
|
+
PLATFORMS
|
21
|
+
java
|
22
|
+
ruby
|
23
|
+
|
24
|
+
DEPENDENCIES
|
25
|
+
jruby_mahout!
|
26
|
+
rake
|
27
|
+
rspec
|
data/MIT-LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright 2012 Vasily Vasinov
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# Jruby Mahout
|
2
|
+
Jruby Mahout is a gem that unleashes the power of Apache Mahout in the world of Jruby. Mahout is a superior machine learning library written in Java. It deals with recommendations, clustering and classification machine learning problems at scale. Until now it was difficult to use it in Ruby projects. You'd have to implement Java interfaces in Jruby yourself, which is not quick especially if you just started exploring the world of machine learning.
|
3
|
+
|
4
|
+
The goal of this library is to make machine learning at scale in Jruby projects simple.
|
5
|
+
|
6
|
+
## Quick Overview
|
7
|
+
This is an early version of a Jruby gem that only supports Mahout recommendations. It also includes a simple Postgres manager that can be used to manage appropriate recommendations tables. Unfortunately it's impossible to use ActiveRecord (AR) with Mahout, because AR operates at a much higher level and creates a lot of overhead that is critical when dealing with millions of records in real time.
|
8
|
+
|
9
|
+
## Get Mahout
|
10
|
+
First of all, you need to download the Mahout library from one of the [mirrors](http://www.apache.org/dyn/closer.cgi/mahout/). Jruby Mahout only supports Mahout 0.7 at this point.
|
11
|
+
|
12
|
+
## Get Postgres JDBC Adapter
|
13
|
+
If you wish to work with a database for recommendations, you'll have to install [JDBC driver for Postgres](http://jdbc.postgresql.org/download.html). Another option is to use file-based recommendation.
|
14
|
+
|
15
|
+
## Installation
|
16
|
+
### 1. Set environment variable MAHOUT_DIR to point at your Mahout installation.
|
17
|
+
### 2. Add the gem to your `Gemfile`
|
18
|
+
```ruby
|
19
|
+
platform :jruby do
|
20
|
+
gem "jruby_mahout"
|
21
|
+
end
|
22
|
+
```
|
23
|
+
And run `bundle install`.
|
24
|
+
|
25
|
+
## Contribute
|
26
|
+
- Fork the project.
|
27
|
+
- Write code for a feature or bug fix.
|
28
|
+
- Add Rspec tests for it.
|
29
|
+
- Commit, do not make changes to rakefile or version.
|
30
|
+
- Submit a pull request.
|
data/Rakefile
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# Gem specification for jruby_mahout.
# lib/ is put on the load path first so the version constant can be read from the
# gem's own source tree.
$LOAD_PATH << File.expand_path("../lib", __FILE__)
require "jruby_mahout/version"

Gem::Specification.new do |gem|
  gem.name = "jruby_mahout"
  gem.version = JrubyMahout::VERSION
  gem.authors = ["Vasily Vasinov"]
  gem.email = ["vasinov@me.com"]
  gem.homepage = "https://github.com/vasinov/jruby_mahout"
  gem.summary = "Jruby Mahout is a gem that unleashes the power of Apache Mahout in the world of Jruby."
  gem.description = "Jruby Mahout is a gem that unleashes the power of Apache Mahout in the world of Jruby. Mahout is a superior machine learning library written in Java. It deals with recommendations, clustering and classification machine learning problems at scale. Until now it was difficult to use it in Ruby projects. You'd have to implement Java interfaces in Jruby yourself, which is not quick especially if you just started exploring the world of machine learning."
  gem.license = "MIT"

  gem.test_files = Dir["spec/**/*"]
  # The packaged file list is everything tracked by git; executables are whatever
  # that list contains under bin/.
  gem.files = `git ls-files`.split($/)
  gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.require_paths = ["lib"]

  gem.add_development_dependency "rake"
  gem.add_development_dependency "rspec"
end
|
data/lib/jruby_mahout.rb
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
# Entry point of the gem: loads JRuby's Java bridge, the Mahout 0.7 jars found under
# ENV["MAHOUT_DIR"], and then every Ruby source file of the gem itself.
module JrubyMahout
  require 'java'

  # Mahout jars expected directly inside the Mahout installation directory.
  require File.join(ENV["MAHOUT_DIR"], 'mahout-core-0.7.jar')
  require File.join(ENV["MAHOUT_DIR"], 'mahout-integration-0.7.jar')
  require File.join(ENV["MAHOUT_DIR"], 'mahout-math-0.7.jar')

  # Every jar found in the installation's lib/ directory.
  Dir.glob(File.join(ENV["MAHOUT_DIR"], 'lib/*.jar')).each { |jar| require jar }

  # The gem's own sources are located relative to this file, not to the process's
  # working directory, so they are found when the gem is required from another
  # project. Sorting keeps the load order independent of the filesystem.
  Dir[File.expand_path('../jruby_mahout/*.rb', __FILE__)].sort.each { |source| require source }
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module JrubyMahout
  # Builds the data model object for one of the supported storage back ends and
  # exposes it through #data_model.
  class DataModel
    # The data model that was built, or nil when none could be built.
    attr_accessor :data_model

    # data_model_type - "file", "mysql" or "postgres"; a String or a Symbol.
    # params          - Hash of options. "file" reads params[:file_path]; "postgres"
    #                   hands the whole hash to PostgresManager, both to construct it
    #                   and to set up the model.
    #
    # "mysql" is not implemented yet and, like any unrecognised type, leaves
    # data_model set to nil.
    def initialize(data_model_type, params)
      @data_model =
        case data_model_type.to_s
        when "file"
          FileDataModel.new(java.io.File.new(params[:file_path]))
        when "postgres"
          PostgresManager.new(params).setup_data_model(params)
        else
          # Also reached for "mysql". TODO: implement the MySQL back end.
          nil
        end
    end
  end
end
|
20
|
+
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module JrubyMahout
  # Scores a recommender configuration with an
  # AverageAbsoluteDifferenceRecommenderEvaluator.
  class Evaluator
    # data_model          - the data model handed to the evaluator on every run.
    # recommender_builder - object responding to recommender_name and
    #                       item_based_allowed; it is also passed to the evaluator.
    def initialize(data_model, recommender_builder)
      @data_model = data_model
      @recommender_builder = recommender_builder
      @mahout_evaluator = AverageAbsoluteDifferenceRecommenderEvaluator.new
    end

    # Runs the evaluation with the given training and evaluation percentages, which
    # are passed through unchanged.
    #
    # Returns the evaluator's result converted with Float(), or nil when the builder
    # is configured for an item-based recommender that it does not allow.
    def evaluate(training_percentage, evaluation_percentage)
      return nil if item_based_blocked?

      # The second argument is passed as nil, exactly as before.
      # NOTE(review): presumably the optional data-model builder; confirm against the Mahout API.
      score = @mahout_evaluator.evaluate(@recommender_builder, nil, @data_model,
                                         training_percentage, evaluation_percentage)
      Float(score)
    end

    private

    # True when the builder targets GenericItemBasedRecommender but reports that
    # item-based recommendation is not allowed.
    def item_based_blocked?
      @recommender_builder.recommender_name == "GenericItemBasedRecommender" &&
        !@recommender_builder.item_based_allowed
    end
  end
end
|
18
|
+
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# Recommender builder interface and similarity measures
|
2
|
+
java_import org.apache.mahout.cf.taste.eval.RecommenderBuilder
|
3
|
+
java_import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity
|
4
|
+
java_import org.apache.mahout.cf.taste.impl.similarity.EuclideanDistanceSimilarity
|
5
|
+
java_import org.apache.mahout.cf.taste.impl.similarity.SpearmanCorrelationSimilarity
|
6
|
+
java_import org.apache.mahout.cf.taste.impl.similarity.LogLikelihoodSimilarity
|
7
|
+
java_import org.apache.mahout.cf.taste.impl.similarity.TanimotoCoefficientSimilarity
|
8
|
+
java_import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity
|
9
|
+
|
10
|
+
# Neighborhoods
|
11
|
+
java_import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood
|
12
|
+
|
13
|
+
# Recommenders
|
14
|
+
java_import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender
|
15
|
+
java_import org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender
|
16
|
+
java_import org.apache.mahout.cf.taste.impl.recommender.slopeone.SlopeOneRecommender
|
17
|
+
|
18
|
+
# Weighting
|
19
|
+
java_import org.apache.mahout.cf.taste.common.Weighting
|
20
|
+
|
21
|
+
# Evaluators
|
22
|
+
java_import org.apache.mahout.cf.taste.impl.eval.AverageAbsoluteDifferenceRecommenderEvaluator
|
23
|
+
|
24
|
+
# Data Models
|
25
|
+
java_import org.apache.mahout.cf.taste.impl.model.jdbc.PostgreSQLJDBCDataModel
|
26
|
+
java_import org.apache.mahout.cf.taste.impl.model.file.FileDataModel
|
27
|
+
|
28
|
+
|
29
|
+
# Postgres
|
30
|
+
begin
|
31
|
+
java_import org.postgresql.ds.PGPoolingDataSource
|
32
|
+
rescue Exception => e
|
33
|
+
puts e
|
34
|
+
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
module JrubyMahout
  # Manages a PostgreSQL pooling data source and the preference table used with a
  # PostgreSQLJDBCDataModel. Failures are printed with puts rather than raised.
  #
  # Table names are interpolated into SQL as given, so they must come from trusted
  # code, never from end-user input.
  class PostgresManager
    attr_accessor :data_model, :data_source, :statement

    # Configures the data source.
    #
    # params - Hash with :username, :password, :host, :port and :db_name.
    def initialize(params)
      @data_source = PGPoolingDataSource.new
      @data_source.setUser(params[:username])
      @data_source.setPassword(params[:password])
      @data_source.setServerName(params[:host])
      @data_source.setPortNumber(params[:port])
      @data_source.setDatabaseName(params[:db_name])
    end

    # Builds the JDBC data model over params[:table_name] using the columns
    # user_id, item_id, rating and created.
    #
    # Returns the model (also stored in #data_model); on failure the error is
    # printed and nil is returned.
    def setup_data_model(params)
      @data_model = PostgreSQLJDBCDataModel.new(@data_source, params[:table_name],
                                                "user_id", "item_id", "rating", "created")
    rescue Exception => e
      puts e
    end

    # Opens a connection from the data source and stores a statement created on it
    # in #statement. The table and record methods below all run through that
    # statement, so this has to be called before them.
    def create_statement
      connection = @data_source.getConnection
      @statement = connection.createStatement
    rescue Exception => e
      puts e
    end

    # Closes the data source; errors are printed.
    def close_data_source
      @data_source.close
    rescue Exception => e
      puts e
    end

    # Updates the (user_id, item_id) row of table `name`, then inserts it when no
    # such row exists.
    #
    # record - Hash with :user_id, :item_id and :rating. Each value must be a number
    #          or a numeric string. Anything else is rejected before any SQL runs and
    #          the error is printed, so record values cannot inject SQL.
    # name   - table name (trusted, see the class comment).
    def upsert_record(record, name)
      user_id = sql_number(record[:user_id])
      item_id = sql_number(record[:item_id])
      rating = sql_number(record[:rating])

      @statement.execute("UPDATE #{name} SET user_id=#{user_id}, item_id=#{item_id}, rating=#{rating} WHERE user_id=#{user_id} AND item_id=#{item_id};")
      @statement.execute("INSERT INTO #{name} (user_id, item_id, rating) SELECT #{user_id}, #{item_id}, #{rating} WHERE NOT EXISTS (SELECT 1 FROM #{name} WHERE user_id=#{user_id} AND item_id=#{item_id});")
    rescue ArgumentError, TypeError => e
      # Raised by sql_number for a non-numeric value; reported like an SQL failure.
      puts e
    rescue java.sql.SQLException => e
      puts e
    end

    # Creates table `name` with its primary key, plus one index on user_id and one
    # on item_id. SQL errors are printed.
    def create_table(name)
      @statement.executeUpdate("
        CREATE TABLE #{name} (
          user_id BIGINT NOT NULL,
          item_id BIGINT NOT NULL,
          rating int NOT NULL,
          created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
          PRIMARY KEY (user_id, item_id)
        );
      ")
      @statement.executeUpdate("CREATE INDEX #{name}_user_id_index ON #{name} (user_id);")
      @statement.executeUpdate("CREATE INDEX #{name}_item_id_index ON #{name} (item_id);")
    rescue java.sql.SQLException => e
      puts e
    end

    # Drops both indexes and then table `name`, each only if it exists. SQL errors
    # are printed.
    def delete_table(name)
      @statement.executeUpdate("DROP INDEX IF EXISTS #{name}_user_id_index;")
      @statement.executeUpdate("DROP INDEX IF EXISTS #{name}_item_id_index;")
      @statement.executeUpdate("DROP TABLE IF EXISTS #{name};")
    rescue java.sql.SQLException => e
      puts e
    end

    private

    # Returns `value` as a Numeric that is safe to interpolate into SQL. Numbers
    # pass through untouched; other values are parsed as an Integer first (so large
    # ids keep full precision) and as a Float otherwise.
    # Raises ArgumentError or TypeError when the value is not numeric.
    def sql_number(value)
      return value if value.is_a?(Numeric)

      begin
        Integer(value)
      rescue ArgumentError
        Float(value)
      end
    end
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
module JrubyMahout
  # Front end for one recommender configuration. The underlying recommender is
  # built by a RecommenderBuilder as soon as a data model is assigned; until then
  # every query method returns nil.
  class Recommender
    attr_accessor :is_weighted, :neighborhood_size, :similarity_name, :recommender_name
    attr_reader :data_model

    # similarity_name, neighborhood_size, recommender_name and is_weighted are
    # stored and forwarded unchanged to RecommenderBuilder.
    def initialize(similarity_name, neighborhood_size, recommender_name, is_weighted)
      @is_weighted = is_weighted
      @neighborhood_size = neighborhood_size
      @similarity_name = similarity_name
      @recommender_name = recommender_name
      @recommender_builder = RecommenderBuilder.new(@similarity_name,
                                                    @neighborhood_size,
                                                    @recommender_name,
                                                    @is_weighted)
      @data_model = nil
      @recommender = nil
    end

    # Stores the data model and (re)builds the recommender for it.
    def data_model=(data_model)
      @data_model = data_model
      @recommender = @recommender_builder.buildRecommender(@data_model)
    end

    # Returns an Array of [item_id, value] pairs, each value rounded to 5 decimals,
    # or nil when no recommender has been built.
    def recommend(user_id, number_of_items, rescorer)
      return nil if @recommender.nil?

      recommendations_to_array(@recommender.recommend(user_id, number_of_items, rescorer))
    end

    # Evaluates the current configuration against the current data model; see
    # Evaluator#evaluate for the result.
    def evaluate(training_percentage, evaluation_percentage)
      evaluator = Evaluator.new(@data_model, @recommender_builder)
      evaluator.evaluate(training_percentage, evaluation_percentage)
    end

    # Items most similar to item_id, as returned by the recommender. Only an
    # item-based recommender offers mostSimilarItems, so every other configuration
    # (or a missing recommender) yields nil.
    def similar_items(item_id, number_of_items, rescorer)
      return nil unless built_as?("GenericItemBasedRecommender")

      @recommender.mostSimilarItems(item_id, number_of_items, rescorer)
    end

    # Users most similar to user_id, as returned by the recommender. Only a
    # user-based recommender offers mostSimilarUserIDs, so every other configuration
    # (or a missing recommender) yields nil.
    def similar_users(user_id, number_of_items, rescorer)
      return nil unless built_as?("GenericUserBasedRecommender")

      @recommender.mostSimilarUserIDs(user_id, number_of_items, rescorer)
    end

    # Estimated preference of user_id for item_id, or nil when no recommender has
    # been built.
    def estimate_preference(user_id, item_id)
      return nil if @recommender.nil?

      @recommender.estimatePreference(user_id, item_id)
    end

    # Result of the recommender's recommendedBecause for (user_id, item_id). Like
    # similar_items this is only offered by an item-based recommender; nil otherwise.
    def recommended_because(user_id, item_id, number_of_items)
      return nil unless built_as?("GenericItemBasedRecommender")

      @recommender.recommendedBecause(user_id, item_id, number_of_items)
    end

    private

    # True when a recommender has been built and it was configured under `name`.
    def built_as?(name)
      !@recommender.nil? && @recommender_name == name
    end

    # Converts recommended items into [item_id, rounded_value] pairs.
    def recommendations_to_array(recommendations)
      recommendations.map do |recommendation|
        [recommendation.getItemID, recommendation.getValue.round(5)]
      end
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module JrubyMahout
  # Ruby counterpart of the RecommenderBuilder interface: creates the recommender
  # to be used or evaluated for a given data model.
  class RecommenderBuilder
    attr_accessor :recommender_name, :item_based_allowed

    # similarity_name   - name of the similarity class to use (see build_similarity).
    # neighborhood_size - size of the user neighborhood; converted with Integer()
    #                     when used, and may be nil.
    # recommender_name  - "GenericUserBasedRecommender", "GenericItemBasedRecommender"
    #                     or "SlopeOneRecommender".
    # is_weighted       - whether Pearson/Euclidean similarities are built weighted.
    def initialize(similarity_name, neighborhood_size, recommender_name, is_weighted)
      @is_weighted = is_weighted
      @neighborhood_size = neighborhood_size
      @similarity_name = similarity_name
      @recommender_name = recommender_name
      # Item-based recommendation is switched off for the Spearman similarity.
      @item_based_allowed = @similarity_name != "SpearmanCorrelationSimilarity"
    end

    # Builds the recommender for `data_model`.
    #
    # Only the collaborators the selected recommender needs are constructed: a
    # similarity for the user- and item-based recommenders, and a neighborhood for
    # the user-based one alone. A failure in a part that is not needed therefore
    # cannot replace an otherwise valid recommender.
    #
    # Returns the recommender; nil for an unknown recommender name or a disallowed
    # item-based configuration. If construction raises, the exception object itself
    # is returned instead of being raised. This is long-standing behaviour that
    # callers may rely on, so check the result before using it as a recommender.
    def buildRecommender(data_model)
      case @recommender_name
      when "GenericUserBasedRecommender"
        similarity = build_similarity(data_model)
        neighborhood = build_neighborhood(similarity, data_model)
        GenericUserBasedRecommender.new(data_model, neighborhood, similarity)
      when "GenericItemBasedRecommender"
        @item_based_allowed ? GenericItemBasedRecommender.new(data_model, build_similarity(data_model)) : nil
      when "SlopeOneRecommender"
        SlopeOneRecommender.new(data_model)
      end
    rescue Exception => e
      e
    end

    private

    # Instantiates the similarity named by @similarity_name, or returns nil when
    # the name is not recognised.
    def build_similarity(data_model)
      case @similarity_name
      when "PearsonCorrelationSimilarity"
        optionally_weighted(PearsonCorrelationSimilarity, data_model)
      when "EuclideanDistanceSimilarity"
        optionally_weighted(EuclideanDistanceSimilarity, data_model)
      when "SpearmanCorrelationSimilarity"
        SpearmanCorrelationSimilarity.new(data_model)
      when "LogLikelihoodSimilarity"
        LogLikelihoodSimilarity.new(data_model)
      when "TanimotoCoefficientSimilarity"
        TanimotoCoefficientSimilarity.new(data_model)
      when "GenericItemSimilarity"
        # This name is served by a weighted Pearson similarity.
        PearsonCorrelationSimilarity.new(data_model, Weighting::WEIGHTED)
      end
    end

    # Builds `similarity_class` with Weighting::WEIGHTED when @is_weighted is set,
    # and without a weighting argument otherwise.
    def optionally_weighted(similarity_class, data_model)
      if @is_weighted
        similarity_class.new(data_model, Weighting::WEIGHTED)
      else
        similarity_class.new(data_model)
      end
    end

    # Builds the nearest-N user neighborhood, or returns nil when no neighborhood
    # size was configured.
    def build_neighborhood(similarity, data_model)
      return nil if @neighborhood_size.nil?

      NearestNUserNeighborhood.new(Integer(@neighborhood_size), similarity, data_model)
    end
  end
end
|