movieDB 0.2.2 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +1 -0
- data/.travis.yml +30 -0
- data/README.md +80 -74
- data/lib/movieDB.rb +225 -268
- data/lib/movieDB/base.rb +4 -5
- data/lib/movieDB/data_analysis.rb +59 -89
- data/lib/movieDB/data_export.rb +20 -21
- data/lib/movieDB/data_process.rb +1 -0
- data/lib/movieDB/genres/en.txt +0 -1
- data/lib/movieDB/person.rb +27 -28
- data/lib/movieDB/status_checker.rb +30 -32
- data/lib/movieDB/version.rb +1 -1
- data/movieDB.gemspec +1 -1
- data/spec/data_process_spec.rb +1 -1
- data/spec/movieDB_spec.rb +5 -7
- data/spec/person_spec.rb +14 -14
- metadata +6 -28
- data/npm-debug.log +0 -0
data/lib/movieDB/base.rb
CHANGED
@@ -3,14 +3,13 @@ require 'MovieDB/status_checker'
|
|
3
3
|
require 'MovieDB/movie_error'
|
4
4
|
|
5
5
|
module MovieDB #:nodoc
|
6
|
-
# MoviesDB
|
7
|
-
# mathematical computations for analyzing film data from imdb.
|
8
|
-
#
|
9
|
-
|
6
|
+
# MoviesDB is not a datastore gem. Rather, it is high-level statistical software that performs
|
7
|
+
# mathematical computations for analyzing film data from imdb.
|
8
|
+
# It is a solution to the common problem of deducing logical hypotheses based on movie data.
|
10
9
|
class Base
|
11
10
|
include StatusChecker
|
12
11
|
include MovieError
|
13
12
|
end
|
14
|
-
|
15
13
|
end
|
16
14
|
$:.unshift File.expand_path('..', __FILE__)
|
15
|
+
|
@@ -1,18 +1,15 @@
|
|
1
1
|
require 'MovieDB'
|
2
2
|
|
3
3
|
module MovieDB
|
4
|
-
|
5
|
-
|
6
|
-
# Analysing, inspecting, cleaning, transforming and modeling data.
|
7
|
-
|
4
|
+
# Analyzing, inspecting, cleaning, transforming and modeling data.
|
5
|
+
#
|
8
6
|
class DataAnalysis < MovieDB::Movie
|
9
7
|
module AnalysisOfVariance
|
10
8
|
module LeastSquares
|
11
9
|
module Statistic
|
12
|
-
|
13
10
|
def basic_statistic (directory_name)
|
14
11
|
open_spreadsheet(directory_name)
|
15
|
-
|
12
|
+
|
16
13
|
if check_imdb_count == true
|
17
14
|
puts "*"*41
|
18
15
|
puts "* A minimum of 2 Imdb id's are required *"
|
@@ -29,76 +26,58 @@ module MovieDB
|
|
29
26
|
@book = Spreadsheet.open File.join('reports', directory_name)
|
30
27
|
@sheet = @book.worksheet(0)
|
31
28
|
|
32
|
-
|
33
|
-
# Add document formatting
|
29
|
+
title_format = Spreadsheet::Format.new :color => :blue, :weight => :bold, :size => 13
|
34
30
|
|
35
|
-
|
36
|
-
:weight => :bold,
|
37
|
-
:size => 13
|
38
|
-
|
39
|
-
@sheet.column(22).width = "worldwide_gross".length
|
31
|
+
@sheet.column(22).width = "worldwide_gross".length
|
40
32
|
end
|
41
33
|
|
42
34
|
def check_imdb_count
|
43
35
|
@sheet.rows.count - 1 == 1
|
44
36
|
end
|
45
37
|
|
46
|
-
def perform_computation
|
47
|
-
|
38
|
+
def perform_computation
|
39
|
+
# Perform computation on the data collected.
|
40
|
+
# TODO: Need to use coefficient statistical formula.
|
41
|
+
# Calculate median as an example but COD formula must be used.
|
42
|
+
# Mean is commonly called the average. Mean or Average is defined as the sum of
|
43
|
+
# all the given elements divided by the total number of elements.
|
44
|
+
#
|
45
|
+
# Range is the difference between the highest and the lowest values in a
|
46
|
+
# frequency distribution.
|
47
|
+
#
|
48
|
+
# Mode is the most frequently occurring value in a frequency distribution.
|
49
|
+
#
|
50
|
+
# Calculate Standard Deviation.
|
51
|
+
# Standard deviation is a statistical measure of spread or variability.
|
52
|
+
#
|
53
|
+
# The standard deviation is the root mean square (RMS) deviation of the
|
54
|
+
# values from their arithmetic mean.
|
48
55
|
total_columns = 22
|
49
56
|
@column = []
|
50
|
-
@row_count = @sheet.rows.count
|
51
57
|
|
52
|
-
|
53
|
-
# Use this total column count to make it dynamic
|
54
|
-
#total_columns = @column_count = @sheet.column_count
|
58
|
+
@row_count = @sheet.rows.count
|
55
59
|
|
56
60
|
1.upto(total_columns) do |c|
|
57
|
-
@column = []
|
58
|
-
|
59
|
-
##
|
60
|
-
# loop through to collect all elements
|
61
|
-
# The returned array includes both strings and integers elements
|
61
|
+
@column = []
|
62
62
|
|
63
63
|
@sheet.each_with_index do |row, i|
|
64
64
|
@column << @sheet[i, 0 + c ]
|
65
65
|
end
|
66
66
|
|
67
|
-
@column.shift
|
68
|
-
@column.compact!
|
69
|
-
row_count = @sheet.rows.count
|
70
|
-
|
71
|
-
##
|
72
|
-
# Perform computation on the data collected
|
73
|
-
# TODO: Need to use coefficienct statistical formula
|
74
|
-
# Calculate median as an example but COD formula must be used
|
67
|
+
@column.shift
|
68
|
+
@column.compact!
|
75
69
|
|
70
|
+
row_count = @sheet.rows.count
|
76
71
|
|
77
|
-
if @column.all? {|i| (1..99999999999).include? (i)}
|
78
|
-
|
72
|
+
if @column.all? { |i| (1..99999999999).include? (i) }
|
79
73
|
n = @column.count
|
80
74
|
@column.sort!
|
81
75
|
|
82
|
-
|
83
|
-
|
84
|
-
# all the given elements divided by the total number of elements.
|
85
|
-
#
|
86
|
-
# Range is the difference between the highest and the lowest values in a
|
87
|
-
# frequency distribution.
|
88
|
-
#
|
89
|
-
# Mode is the most frequently occurring value in a frequency distribution.
|
90
|
-
|
91
|
-
@mean = @column.sum/n # Find the mean
|
92
|
-
@range = @column.max - @column.min # Find the range
|
76
|
+
@mean = @column.sum / n
|
77
|
+
@range = @column.max - @column.min
|
93
78
|
|
94
79
|
freq = @column.inject(Hash.new(0)) { |h, v| h[v] += 1; h }
|
95
|
-
@mode = @column.sort_by { |v| freq[v]}.last # Find the mode
|
96
|
-
|
97
|
-
##
|
98
|
-
# Calculate Standard Deviation
|
99
|
-
# Standard deviation is a statistical measure of spread or variability.
|
100
|
-
# The standard deviation is the root mean square (RMS) deviation of the
|
101
|
-
# values from their arithmetic mean.
|
80
|
+
@mode = @column.sort_by { |v| freq[v] }.last # Find the mode
|
102
81
|
|
103
82
|
@column_squared = []
|
104
83
|
@column.each do |col|
|
@@ -107,15 +86,15 @@ module MovieDB
|
|
107
86
|
|
108
87
|
@sum_of_column = @column.sum
|
109
88
|
@sum_of_column_squared = @column_squared.sum
|
110
|
-
@standard_dev = Math.sqrt((@sum_of_column_squared -((@sum_of_column)*(@sum_of_column)/n))/(n-1))
|
89
|
+
@standard_dev = Math.sqrt((@sum_of_column_squared - ((@sum_of_column) * (@sum_of_column) / n)) / (n - 1))
|
111
90
|
|
112
91
|
if n.odd?
|
113
|
-
index = (n + 1)/2
|
114
|
-
@median = @column[index - 1]
|
92
|
+
index = (n + 1) / 2
|
93
|
+
@median = @column[index - 1]
|
115
94
|
else
|
116
|
-
middle_index = n/2
|
95
|
+
middle_index = n / 2
|
117
96
|
right_index = middle_index + 1
|
118
|
-
@median = (@column[middle_index - 1] + @column[right_index - 1])/2
|
97
|
+
@median = (@column[middle_index - 1] + @column[right_index - 1]) / 2
|
119
98
|
end
|
120
99
|
|
121
100
|
else
|
@@ -126,9 +105,6 @@ module MovieDB
|
|
126
105
|
@standard_dev = "N/A"
|
127
106
|
end
|
128
107
|
|
129
|
-
##
|
130
|
-
# Insert results into spreadsheet cell
|
131
|
-
|
132
108
|
@sheet[@row_count + 2, 0 ] = "Mean"
|
133
109
|
@sheet[@row_count + 2, 0 + c ] = @mean
|
134
110
|
|
@@ -143,7 +119,6 @@ module MovieDB
|
|
143
119
|
|
144
120
|
@sheet[@row_count + 6, 0 ] = "Standard Deviation"
|
145
121
|
@sheet[@row_count + 6, 0 + c ] = @standard_dev
|
146
|
-
|
147
122
|
end
|
148
123
|
end
|
149
124
|
|
@@ -155,13 +130,16 @@ module MovieDB
|
|
155
130
|
end
|
156
131
|
|
157
132
|
def insert_data_to_existing_xls_file
|
158
|
-
|
159
133
|
filename = ("#{report_name}.xls")
|
160
134
|
@book.write File.join('reports', filename)
|
161
135
|
return filename
|
162
136
|
end
|
163
137
|
end
|
164
|
-
|
138
|
+
|
139
|
+
module Coefficient_Of_Determination
|
140
|
+
# TODO: Add code.
|
141
|
+
end
|
142
|
+
|
165
143
|
module Discrete_Least_Squares_Meshless_Method; end
|
166
144
|
module Explained_Sum_Of_Squares; end
|
167
145
|
module Fraction_Of_Variance_Unexplained; end
|
@@ -192,12 +170,10 @@ module MovieDB
|
|
192
170
|
module Variable_Kernel_Density_Estimation; end
|
193
171
|
end
|
194
172
|
|
195
|
-
##
|
196
|
-
# primarily EDA is for seeing what the data can
|
197
|
-
# tell us beyond the formal modeling or hypothesis testing task
|
198
|
-
# The output will be a visual material
|
199
|
-
|
200
173
|
module ExploratoryDataAnalysis
|
174
|
+
# Primarily, EDA is for seeing what the data can
|
175
|
+
# tell us beyond the formal modeling or hypothesis testing task.
|
176
|
+
# The output will be visual material.
|
201
177
|
module Data_Reduction; end
|
202
178
|
module Table_Diagonalization; end
|
203
179
|
module Configural_Frequency_Analysis; end
|
@@ -216,12 +192,12 @@ module MovieDB
|
|
216
192
|
module RegressionAnalysis
|
217
193
|
module Choice_Modelling; end
|
218
194
|
|
219
|
-
module Generalized_Linear_Model
|
220
|
-
module Binomial_Regression; end
|
221
|
-
module Generalized_Additive_Model; end
|
222
|
-
module Linear_Probability_Model; end
|
223
|
-
module Poisson_Regression; end
|
224
|
-
module Zero_Inflated_Model; end
|
195
|
+
module Generalized_Linear_Model
|
196
|
+
module Binomial_Regression; end
|
197
|
+
module Generalized_Additive_Model; end
|
198
|
+
module Linear_Probability_Model; end
|
199
|
+
module Poisson_Regression; end
|
200
|
+
module Zero_Inflated_Model; end
|
225
201
|
end
|
226
202
|
|
227
203
|
module Nonparametric_Regression; end
|
@@ -251,23 +227,19 @@ module MovieDB
|
|
251
227
|
end
|
252
228
|
end
|
253
229
|
|
254
|
-
##
|
255
|
-
#TODO: All Mathetical Calculations go here.
|
256
|
-
|
257
230
|
class ExportData
|
258
231
|
def write_spreadsheet (data, data_analysis_name)
|
259
|
-
|
260
232
|
begin data_analysis.is_a? String
|
261
233
|
@data_analysis_name = data_analysis_name.split.join.gsub('_', ' ').downcase.to_s
|
262
234
|
case data_analysis_name
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
235
|
+
when "coefficient of determination"
|
236
|
+
write_coefficient_of_determination
|
237
|
+
when "discrete least squares meshless method"
|
238
|
+
write_discrete_least_squares_meshless_method
|
239
|
+
when "discrete least squares meshless method"
|
240
|
+
write_discrete_least_squares_meshless_method
|
241
|
+
else
|
242
|
+
end
|
271
243
|
rescue
|
272
244
|
raise ArgumentError, 'invalid attribute'
|
273
245
|
end
|
@@ -275,12 +247,10 @@ module MovieDB
|
|
275
247
|
|
276
248
|
def write_coefficient_of_determination
|
277
249
|
book = Spreadsheet::Workbook.new
|
250
|
+
|
278
251
|
sheet1 = book.create_worksheet name: "Data Analysis: #{@data_analysis_name}"
|
279
252
|
sheet1.row(0).concat %w{title released_date worldwide_gross}
|
280
253
|
|
281
|
-
# Loop through the data to collect all values.
|
282
|
-
# Then values into array
|
283
|
-
|
284
254
|
data.each_with_index do |value, index|
|
285
255
|
sheet1[1, index] = "#{value}"
|
286
256
|
end
|
data/lib/movieDB/data_export.rb
CHANGED
@@ -1,13 +1,14 @@
|
|
1
1
|
require "spreadsheet"
|
2
2
|
require "MovieDB"
|
3
|
-
|
3
|
+
|
4
4
|
# This module writes an xls document to a file
|
5
5
|
#
|
6
|
-
# Usage
|
7
|
-
|
6
|
+
# Usage
|
7
|
+
#
|
8
|
+
# @book = Spreadsheet::Workbook.new
|
8
9
|
module MovieDB
|
9
10
|
class DataExport < MovieDB::Movie
|
10
|
-
class << self
|
11
|
+
class << self
|
11
12
|
#TODO: Check the data analysis(DA) name. Write a define_method and include the DA.
|
12
13
|
|
13
14
|
def export_movie_data
|
@@ -17,12 +18,11 @@ module MovieDB
|
|
17
18
|
end
|
18
19
|
|
19
20
|
def create_spreadsheet_file
|
20
|
-
directory_name =
|
21
|
+
directory_name = 'reports'
|
21
22
|
create_directory(directory_name)
|
22
23
|
Spreadsheet.client_encoding = 'UTF-8'
|
23
24
|
@book = Spreadsheet::Workbook.new
|
24
|
-
@sheet = @book.create_worksheet name: "Data Analysis: #{$DATA_ANALYSIS_NAME}" # the analysis
|
25
|
-
|
25
|
+
@sheet = @book.create_worksheet name: "Data Analysis: #{$DATA_ANALYSIS_NAME}" # the analysis name should be an input
|
26
26
|
end
|
27
27
|
|
28
28
|
def create_directory(directory_name)
|
@@ -37,26 +37,25 @@ module MovieDB
|
|
37
37
|
def create_spreadsheet_header
|
38
38
|
@sheet.row(0).concat $IMDB_ATTRIBUTES_HEADERS
|
39
39
|
|
40
|
-
title_format = Spreadsheet::Format.new :color => :blue,
|
41
|
-
:weight => :bold,
|
42
|
-
:size => 13
|
43
|
-
|
40
|
+
title_format = Spreadsheet::Format.new :color => :blue, :weight => :bold, :size => 13
|
44
41
|
float_format = Spreadsheet::Format.new :number_format => "0.00"
|
45
42
|
|
46
43
|
@sheet.row(0).default_format = title_format
|
44
|
+
|
47
45
|
@sheet.column(1).default_format = float_format
|
48
46
|
@sheet.column(16).default_format = float_format
|
49
47
|
@sheet.column(22).default_format = float_format
|
50
48
|
end
|
51
49
|
|
52
|
-
# Loop through array of and array imbd data. Each row has the
|
50
|
+
# Loop through an array of arrays of IMDb data. Each row has
|
53
51
|
# the information about the film/movie
|
54
52
|
# The Data is obtained from MovieDB::Movie
|
55
53
|
# example
|
56
|
-
#
|
54
|
+
#
|
55
|
+
# Film: catching fire
|
57
56
|
def create_spreadsheet_body
|
58
|
-
|
59
|
-
|
57
|
+
$IMDB_ATTRIBUTES_HEADERS.each do |header|
|
58
|
+
case header
|
60
59
|
when 'title' then spreadsheet_body_text_data("title")
|
61
60
|
when 'cast_members' then spreadsheet_body_count_data("cast_members")
|
62
61
|
when 'cast_characters' then spreadsheet_body_count_data("cast_characters")
|
@@ -86,19 +85,19 @@ module MovieDB
|
|
86
85
|
end
|
87
86
|
|
88
87
|
def spreadsheet_body_text_data(header_title)
|
89
|
-
@e_t = element_title = MovieDB::Movie.instance_eval{filter_movie_attr(header_title)}.flatten
|
88
|
+
@e_t = element_title = MovieDB::Movie.instance_eval { filter_movie_attr(header_title) }.flatten
|
90
89
|
|
91
90
|
element_title.each_with_index do |element2, i|
|
92
91
|
element_array = element_title[(i)].split(' ',)
|
93
|
-
@sheet.row(1 + i).concat element_array
|
92
|
+
@sheet.row(1 + i).concat element_array
|
94
93
|
end
|
95
94
|
end
|
96
95
|
|
97
96
|
def spreadsheet_body_count_data(header_title)
|
98
|
-
|
97
|
+
element_cast = MovieDB::Movie.instance_eval { filter_movie_attr(header_title) }
|
99
98
|
|
100
|
-
|
101
|
-
|
99
|
+
0.upto(@e_t.length - 1) do |i|
|
100
|
+
element_array = []
|
102
101
|
|
103
102
|
element_array << element_cast[i].length
|
104
103
|
@sheet.row(1 + i).concat element_array
|
@@ -106,7 +105,7 @@ module MovieDB
|
|
106
105
|
end
|
107
106
|
|
108
107
|
def spreadsheet_body_numeric_data(header_title)
|
109
|
-
@e_t = element_title = MovieDB::Movie.instance_eval{filter_movie_attr(header_title)}
|
108
|
+
@e_t = element_title = MovieDB::Movie.instance_eval { filter_movie_attr(header_title) }
|
110
109
|
|
111
110
|
element_title.each_with_index do |element2, i|
|
112
111
|
element_array = element_title[(i)]
|
data/lib/movieDB/data_process.rb
CHANGED
@@ -5,6 +5,7 @@ module MovieDB
|
|
5
5
|
PATH_AOV = MovieDB::DataAnalysis::AnalysisOfVariance::LeastSquares
|
6
6
|
extend PATH_AOV::Statistic
|
7
7
|
extend PATH_AOV::Coefficient_Of_Determination
|
8
|
+
|
8
9
|
include PATH_AOV::Explained_Sum_Of_Squares
|
9
10
|
include PATH_AOV::Fraction_Of_Variance_Unexplained
|
10
11
|
include PATH_AOV::Gauss_Newton_Algorithm
|
data/lib/movieDB/genres/en.txt
CHANGED
data/lib/movieDB/person.rb
CHANGED
@@ -1,20 +1,19 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'time'
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
3
|
+
# Create an actor instance and return the values for the actor variable.
|
4
|
+
#
|
5
|
+
# actor = MovieDB::Actor.instance_eval{create_with_info("Brittany Murphy", "F", "1977-11-10", "2009-12-20")}
|
6
|
+
# actor = MovieDB::Actor.instance_eval{create_with_info("George Clooney", "M", "1961-05-06", nil)}
|
7
|
+
|
8
|
+
# Example to find the actor name:
|
9
|
+
#
|
10
|
+
# actor_name = actor.map(&:name) #=> ["Brittany Murphy"]
|
11
|
+
#
|
12
|
+
# Example to see if an actor is alive:
|
13
|
+
# actor_name = actor.map(&:alive?) #=> [false, true]
|
14
|
+
#
|
15
|
+
# Example to find an actor's age:
|
16
|
+
# actor_name = actor.map(&:age) #=> [32, 52]
|
18
17
|
module MovieDB
|
19
18
|
class Person
|
20
19
|
attr_accessor :name, :gender, :birth_date, :death_date, :birthplace
|
@@ -49,27 +48,28 @@ module MovieDB
|
|
49
48
|
person.gender = gender
|
50
49
|
person.birth_date = birth_date
|
51
50
|
person.death_date = death_date
|
51
|
+
|
52
52
|
return @person_DS << person
|
53
53
|
end
|
54
54
|
|
55
55
|
def filter_person(attr)
|
56
56
|
attr = attr.to_sym
|
57
|
-
raise ArgumentError
|
57
|
+
raise ArgumentError "#{attr} can only be name or age" if !attr == :age && :name
|
58
|
+
|
58
59
|
return @person_DS.select{|s| s.alive?}.map(&attr)
|
59
60
|
end
|
60
61
|
|
61
|
-
|
62
|
-
|
63
|
-
# rather than a float between min to max.(Ruby 2.0.0)
|
64
|
-
#
|
65
|
-
|
62
|
+
# Returns a random parameter integer between min and max,
|
63
|
+
# rather than a float between min and max. (Ruby 2.0.0)
|
66
64
|
def sample_attr(attr)
|
67
65
|
randgen = Object.new
|
68
66
|
attr_array = self.instance_eval{filter_person(attr)}
|
69
67
|
attr_array.sample(random: randgen)
|
70
68
|
end
|
71
69
|
end
|
70
|
+
|
72
71
|
private_class_method :create_with_info, :filter_person
|
72
|
+
|
73
73
|
end
|
74
74
|
|
75
75
|
class Actor < Person
|
@@ -85,7 +85,8 @@ module MovieDB
|
|
85
85
|
end
|
86
86
|
|
87
87
|
def actor_actress_gender(person)
|
88
|
-
case
|
88
|
+
case
|
89
|
+
when person.gender == 'F'
|
89
90
|
return "actress"
|
90
91
|
when person.gender == "M"
|
91
92
|
return "actor"
|
@@ -95,23 +96,22 @@ module MovieDB
|
|
95
96
|
end
|
96
97
|
|
97
98
|
class << self
|
98
|
-
|
99
99
|
def filter_actor_alive(attr)
|
100
100
|
attr = attr.to_sym
|
101
|
-
raise ArgumentError
|
101
|
+
raise ArgumentError "#{attr} can only be name or age" if !attr == :age && :name
|
102
|
+
|
102
103
|
return @person_DS.select{|s| s.alive?}.map(&"#{attr.to_sym}")
|
103
104
|
end
|
104
105
|
|
105
106
|
def filter_actor_deceased(actor)
|
106
|
-
return @person_DS.select{|s| !s.alive?}.map{|m| "#{m.age}"} if attr == "age"
|
107
|
-
return @person_DS.select{|s| !s.alive?}.map{|m| "#{m.name}"} if attr == "name"
|
107
|
+
return @person_DS.select{ |s| !s.alive?}.map{ |m| "#{m.age}" } if attr == "age"
|
108
|
+
return @person_DS.select{ |s| !s.alive?}.map{ |m| "#{m.name}" } if attr == "name"
|
108
109
|
end
|
109
110
|
|
110
111
|
end
|
111
112
|
end
|
112
113
|
|
113
114
|
class Writer < Person
|
114
|
-
|
115
115
|
attr_accessor :published_work
|
116
116
|
alias :published? :published_work
|
117
117
|
|
@@ -123,7 +123,6 @@ module MovieDB
|
|
123
123
|
end
|
124
124
|
|
125
125
|
class Director < Person
|
126
|
-
|
127
126
|
attr_accessor :filmography
|
128
127
|
|
129
128
|
def initialize(filmography = [])
|