movieDB 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +1 -0
- data/.travis.yml +30 -0
- data/README.md +80 -74
- data/lib/movieDB.rb +225 -268
- data/lib/movieDB/base.rb +4 -5
- data/lib/movieDB/data_analysis.rb +59 -89
- data/lib/movieDB/data_export.rb +20 -21
- data/lib/movieDB/data_process.rb +1 -0
- data/lib/movieDB/genres/en.txt +0 -1
- data/lib/movieDB/person.rb +27 -28
- data/lib/movieDB/status_checker.rb +30 -32
- data/lib/movieDB/version.rb +1 -1
- data/movieDB.gemspec +1 -1
- data/spec/data_process_spec.rb +1 -1
- data/spec/movieDB_spec.rb +5 -7
- data/spec/person_spec.rb +14 -14
- metadata +6 -28
- data/npm-debug.log +0 -0
data/lib/movieDB/base.rb
CHANGED
@@ -3,14 +3,13 @@ require 'MovieDB/status_checker'
|
|
3
3
|
require 'MovieDB/movie_error'
|
4
4
|
|
5
5
|
module MovieDB #:nodoc
|
6
|
-
# MoviesDB
|
7
|
-
# mathematical computations for analyzing film data from imdb.
|
8
|
-
#
|
9
|
-
|
6
|
+
# MoviesDB is not a datastore gem. Rather, it is a high-level statistical software that performs
|
7
|
+
# mathematical computations for analyzing film data from imdb.
|
8
|
+
# It is a solution to the common problem of deducing logical hypothesis based off movie data.
|
10
9
|
class Base
|
11
10
|
include StatusChecker
|
12
11
|
include MovieError
|
13
12
|
end
|
14
|
-
|
15
13
|
end
|
16
14
|
$:.unshift File.expand_path('..', __FILE__)
|
15
|
+
|
@@ -1,18 +1,15 @@
|
|
1
1
|
require 'MovieDB'
|
2
2
|
|
3
3
|
module MovieDB
|
4
|
-
|
5
|
-
|
6
|
-
# Analysing, inspecting, cleaning, transforming and modeling data.
|
7
|
-
|
4
|
+
# Analyzing, inspecting, cleaning, transforming and modeling data.
|
5
|
+
#
|
8
6
|
class DataAnalysis < MovieDB::Movie
|
9
7
|
module AnalysisOfVariance
|
10
8
|
module LeastSquares
|
11
9
|
module Statistic
|
12
|
-
|
13
10
|
def basic_statistic (directory_name)
|
14
11
|
open_spreadsheet(directory_name)
|
15
|
-
|
12
|
+
|
16
13
|
if check_imdb_count == true
|
17
14
|
puts "*"*41
|
18
15
|
puts "* A minimum of 2 Imdb id's are required *"
|
@@ -29,76 +26,58 @@ module MovieDB
|
|
29
26
|
@book = Spreadsheet.open File.join('reports', directory_name)
|
30
27
|
@sheet = @book.worksheet(0)
|
31
28
|
|
32
|
-
|
33
|
-
# Add document formatting
|
29
|
+
title_format = Spreadsheet::Format.new :color => :blue, :weight => :bold, :size => 13
|
34
30
|
|
35
|
-
|
36
|
-
:weight => :bold,
|
37
|
-
:size => 13
|
38
|
-
|
39
|
-
@sheet.column(22).width = "worldwide_gross".length
|
31
|
+
@sheet.column(22).width = "worldwide_gross".length
|
40
32
|
end
|
41
33
|
|
42
34
|
def check_imdb_count
|
43
35
|
@sheet.rows.count - 1 == 1
|
44
36
|
end
|
45
37
|
|
46
|
-
def perform_computation
|
47
|
-
|
38
|
+
def perform_computation
|
39
|
+
# Perform computation on the data collected.
|
40
|
+
# TODO: Need to use coefficient statistical formula.
|
41
|
+
# Calculate median as an example but COD formula must be used.
|
42
|
+
# Mean is commonly called the average. Mean or Average is defined as the sum of
|
43
|
+
# all the given elements divided by the total number of elements.
|
44
|
+
#
|
45
|
+
# Range is the difference between the highest and the lowest values in a
|
46
|
+
# frequency distribution.
|
47
|
+
#
|
48
|
+
# Mode is the most frequently occurring value in a frequency distribution.
|
49
|
+
#
|
50
|
+
# Calculate Standard Deviation.
|
51
|
+
# Standard deviation is a statistical measure of spread or variability.
|
52
|
+
#
|
53
|
+
# The standard deviation is the root mean square (RMS) deviation of the
|
54
|
+
# values from their arithmetic mean.
|
48
55
|
total_columns = 22
|
49
56
|
@column = []
|
50
|
-
@row_count = @sheet.rows.count
|
51
57
|
|
52
|
-
|
53
|
-
# Use this total column count to make it dynamic
|
54
|
-
#total_columns = @column_count = @sheet.column_count
|
58
|
+
@row_count = @sheet.rows.count
|
55
59
|
|
56
60
|
1.upto(total_columns) do |c|
|
57
|
-
@column = []
|
58
|
-
|
59
|
-
##
|
60
|
-
# loop through to collect all elements
|
61
|
-
# The returned array includes both strings and integers elements
|
61
|
+
@column = []
|
62
62
|
|
63
63
|
@sheet.each_with_index do |row, i|
|
64
64
|
@column << @sheet[i, 0 + c ]
|
65
65
|
end
|
66
66
|
|
67
|
-
@column.shift
|
68
|
-
@column.compact!
|
69
|
-
row_count = @sheet.rows.count
|
70
|
-
|
71
|
-
##
|
72
|
-
# Perform computation on the data collected
|
73
|
-
# TODO: Need to use coefficienct statistical formula
|
74
|
-
# Calculate median as an example but COD formula must be used
|
67
|
+
@column.shift
|
68
|
+
@column.compact!
|
75
69
|
|
70
|
+
row_count = @sheet.rows.count
|
76
71
|
|
77
|
-
if @column.all? {|i| (1..99999999999).include? (i)}
|
78
|
-
|
72
|
+
if @column.all? { |i| (1..99999999999).include? (i) }
|
79
73
|
n = @column.count
|
80
74
|
@column.sort!
|
81
75
|
|
82
|
-
|
83
|
-
|
84
|
-
# all the given elements divided by the total number of elements.
|
85
|
-
#
|
86
|
-
# Range is the difference between the highest and the lowest values in a
|
87
|
-
# frequency distribution.
|
88
|
-
#
|
89
|
-
# Mode is the most frequently occurring value in a frequency distribution.
|
90
|
-
|
91
|
-
@mean = @column.sum/n # Find the mean
|
92
|
-
@range = @column.max - @column.min # Find the range
|
76
|
+
@mean = @column.sum / n
|
77
|
+
@range = @column.max - @column.min
|
93
78
|
|
94
79
|
freq = @column.inject(Hash.new(0)) { |h, v| h[v] += 1; h }
|
95
|
-
@mode = @column.sort_by { |v| freq[v]}.last # Find the mode
|
96
|
-
|
97
|
-
##
|
98
|
-
# Calculate Standard Deviation
|
99
|
-
# Standard deviation is a statistical measure of spread or variability.
|
100
|
-
# The standard deviation is the root mean square (RMS) deviation of the
|
101
|
-
# values from their arithmetic mean.
|
80
|
+
@mode = @column.sort_by { |v| freq[v] }.last # Find the mode
|
102
81
|
|
103
82
|
@column_squared = []
|
104
83
|
@column.each do |col|
|
@@ -107,15 +86,15 @@ module MovieDB
|
|
107
86
|
|
108
87
|
@sum_of_column = @column.sum
|
109
88
|
@sum_of_column_squared = @column_squared.sum
|
110
|
-
@standard_dev = Math.sqrt((@sum_of_column_squared -((@sum_of_column)*(@sum_of_column)/n))/(n-1))
|
89
|
+
@standard_dev = Math.sqrt((@sum_of_column_squared - ((@sum_of_column) * (@sum_of_column) / n)) / (n - 1))
|
111
90
|
|
112
91
|
if n.odd?
|
113
|
-
index = (n + 1)/2
|
114
|
-
@median = @column[index - 1]
|
92
|
+
index = (n + 1) / 2
|
93
|
+
@median = @column[index - 1]
|
115
94
|
else
|
116
|
-
middle_index = n/2
|
95
|
+
middle_index = n / 2
|
117
96
|
right_index = middle_index + 1
|
118
|
-
@median = (@column[middle_index - 1] + @column[right_index - 1])/2
|
97
|
+
@median = (@column[middle_index - 1] + @column[right_index - 1]) / 2
|
119
98
|
end
|
120
99
|
|
121
100
|
else
|
@@ -126,9 +105,6 @@ module MovieDB
|
|
126
105
|
@standard_dev = "N/A"
|
127
106
|
end
|
128
107
|
|
129
|
-
##
|
130
|
-
# Insert results into spreadsheet cell
|
131
|
-
|
132
108
|
@sheet[@row_count + 2, 0 ] = "Mean"
|
133
109
|
@sheet[@row_count + 2, 0 + c ] = @mean
|
134
110
|
|
@@ -143,7 +119,6 @@ module MovieDB
|
|
143
119
|
|
144
120
|
@sheet[@row_count + 6, 0 ] = "Standard Deviation"
|
145
121
|
@sheet[@row_count + 6, 0 + c ] = @standard_dev
|
146
|
-
|
147
122
|
end
|
148
123
|
end
|
149
124
|
|
@@ -155,13 +130,16 @@ module MovieDB
|
|
155
130
|
end
|
156
131
|
|
157
132
|
def insert_data_to_existing_xls_file
|
158
|
-
|
159
133
|
filename = ("#{report_name}.xls")
|
160
134
|
@book.write File.join('reports', filename)
|
161
135
|
return filename
|
162
136
|
end
|
163
137
|
end
|
164
|
-
|
138
|
+
|
139
|
+
module Coefficient_Of_Determination
|
140
|
+
# TODO: Add code.
|
141
|
+
end
|
142
|
+
|
165
143
|
module Discrete_Least_Squares_Meshless_Method; end
|
166
144
|
module Explained_Sum_Of_Squares; end
|
167
145
|
module Fraction_Of_Variance_Unexplained; end
|
@@ -192,12 +170,10 @@ module MovieDB
|
|
192
170
|
module Variable_Kernel_Density_Estimation; end
|
193
171
|
end
|
194
172
|
|
195
|
-
##
|
196
|
-
# primarily EDA is for seeing what the data can
|
197
|
-
# tell us beyond the formal modeling or hypothesis testing task
|
198
|
-
# The output will be a visual material
|
199
|
-
|
200
173
|
module ExploratoryDataAnalysis
|
174
|
+
# Primarily, EDA is for seeing what the data can
|
175
|
+
# tell us beyond the formal modeling or hypothesis testing task.
|
176
|
+
# The output will be a visual material.
|
201
177
|
module Data_Reduction; end
|
202
178
|
module Table_Diagonalization; end
|
203
179
|
module Configural_Frequency_Analysis; end
|
@@ -216,12 +192,12 @@ module MovieDB
|
|
216
192
|
module RegressionAnalysis
|
217
193
|
module Choice_Modelling; end
|
218
194
|
|
219
|
-
module Generalized_Linear_Model
|
220
|
-
module Binomial_Regression; end
|
221
|
-
module Generalized_Additive_Model; end
|
222
|
-
module Linear_Probability_Model; end
|
223
|
-
module Poisson_Regression; end
|
224
|
-
module Zero_Inflated_Model; end
|
195
|
+
module Generalized_Linear_Model
|
196
|
+
module Binomial_Regression; end
|
197
|
+
module Generalized_Additive_Model; end
|
198
|
+
module Linear_Probability_Model; end
|
199
|
+
module Poisson_Regression; end
|
200
|
+
module Zero_Inflated_Model; end
|
225
201
|
end
|
226
202
|
|
227
203
|
module Nonparametric_Regression; end
|
@@ -251,23 +227,19 @@ module MovieDB
|
|
251
227
|
end
|
252
228
|
end
|
253
229
|
|
254
|
-
##
|
255
|
-
#TODO: All Mathetical Calculations go here.
|
256
|
-
|
257
230
|
class ExportData
|
258
231
|
def write_spreadsheet (data, data_analysis_name)
|
259
|
-
|
260
232
|
begin data_analysis.is_a? String
|
261
233
|
@data_analysis_name = data_analysis_name.split.join.gsub('_', ' ').downcase.to_s
|
262
234
|
case data_analysis_name
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
235
|
+
when "coefficient of determination"
|
236
|
+
write_coefficient_of_determination
|
237
|
+
when "discrete least squares meshless method"
|
238
|
+
write_discrete_least_squares_meshless_method
|
239
|
+
when "discrete least squares meshless method"
|
240
|
+
write_discrete_least_squares_meshless_method
|
241
|
+
else
|
242
|
+
end
|
271
243
|
rescue
|
272
244
|
raise ArgumentError, 'invalid attribute'
|
273
245
|
end
|
@@ -275,12 +247,10 @@ module MovieDB
|
|
275
247
|
|
276
248
|
def write_coefficient_of_determination
|
277
249
|
book = Spreadsheet::Workbook.new
|
250
|
+
|
278
251
|
sheet1 = book.create_worksheet name: "Data Analysis: #{@data_analysis_name}"
|
279
252
|
sheet1.row(0).concat %w{title released_date worldwide_gross}
|
280
253
|
|
281
|
-
# Loop through the data to collect all values.
|
282
|
-
# Then values into array
|
283
|
-
|
284
254
|
data.each_with_index do |value, index|
|
285
255
|
sheet1[1, index] = "#{value}"
|
286
256
|
end
|
data/lib/movieDB/data_export.rb
CHANGED
@@ -1,13 +1,14 @@
|
|
1
1
|
require "spreadsheet"
|
2
2
|
require "MovieDB"
|
3
|
-
|
3
|
+
|
4
4
|
# This module will write xls document to file
|
5
5
|
#
|
6
|
-
# Usage
|
7
|
-
|
6
|
+
# Usage
|
7
|
+
#
|
8
|
+
# @book = Spreadsheet::Workbook.new
|
8
9
|
module MovieDB
|
9
10
|
class DataExport < MovieDB::Movie
|
10
|
-
class << self
|
11
|
+
class << self
|
11
12
|
#TODO: Check the data analysis(DA) name. Write a define_method and include the DA.
|
12
13
|
|
13
14
|
def export_movie_data
|
@@ -17,12 +18,11 @@ module MovieDB
|
|
17
18
|
end
|
18
19
|
|
19
20
|
def create_spreadsheet_file
|
20
|
-
directory_name =
|
21
|
+
directory_name = 'reports'
|
21
22
|
create_directory(directory_name)
|
22
23
|
Spreadsheet.client_encoding = 'UTF-8'
|
23
24
|
@book = Spreadsheet::Workbook.new
|
24
|
-
@sheet = @book.create_worksheet name: "Data Analysis: #{$DATA_ANALYSIS_NAME}" # the analysis
|
25
|
-
|
25
|
+
@sheet = @book.create_worksheet name: "Data Analysis: #{$DATA_ANALYSIS_NAME}" # the analysis name should be an input
|
26
26
|
end
|
27
27
|
|
28
28
|
def create_directory(directory_name)
|
@@ -37,26 +37,25 @@ module MovieDB
|
|
37
37
|
def create_spreadsheet_header
|
38
38
|
@sheet.row(0).concat $IMDB_ATTRIBUTES_HEADERS
|
39
39
|
|
40
|
-
title_format = Spreadsheet::Format.new :color => :blue,
|
41
|
-
:weight => :bold,
|
42
|
-
:size => 13
|
43
|
-
|
40
|
+
title_format = Spreadsheet::Format.new :color => :blue, :weight => :bold, :size => 13
|
44
41
|
float_format = Spreadsheet::Format.new :number_format => "0.00"
|
45
42
|
|
46
43
|
@sheet.row(0).default_format = title_format
|
44
|
+
|
47
45
|
@sheet.column(1).default_format = float_format
|
48
46
|
@sheet.column(16).default_format = float_format
|
49
47
|
@sheet.column(22).default_format = float_format
|
50
48
|
end
|
51
49
|
|
52
|
-
# Loop through array of and array imbd data. Each row has the
|
50
|
+
# Loop through an array of arrays of IMDb data. Each row has
|
53
51
|
# the information about the film/movie
|
54
52
|
# The Data is obtained from MovieDB::Movie
|
55
53
|
# example
|
56
|
-
#
|
54
|
+
#
|
55
|
+
# Film: catching fire
|
57
56
|
def create_spreadsheet_body
|
58
|
-
|
59
|
-
|
57
|
+
$IMDB_ATTRIBUTES_HEADERS.each do |header|
|
58
|
+
case header
|
60
59
|
when 'title' then spreadsheet_body_text_data("title")
|
61
60
|
when 'cast_members' then spreadsheet_body_count_data("cast_members")
|
62
61
|
when 'cast_characters' then spreadsheet_body_count_data("cast_characters")
|
@@ -86,19 +85,19 @@ module MovieDB
|
|
86
85
|
end
|
87
86
|
|
88
87
|
def spreadsheet_body_text_data(header_title)
|
89
|
-
@e_t = element_title = MovieDB::Movie.instance_eval{filter_movie_attr(header_title)}.flatten
|
88
|
+
@e_t = element_title = MovieDB::Movie.instance_eval { filter_movie_attr(header_title) }.flatten
|
90
89
|
|
91
90
|
element_title.each_with_index do |element2, i|
|
92
91
|
element_array = element_title[(i)].split(' ',)
|
93
|
-
@sheet.row(1 + i).concat element_array
|
92
|
+
@sheet.row(1 + i).concat element_array
|
94
93
|
end
|
95
94
|
end
|
96
95
|
|
97
96
|
def spreadsheet_body_count_data(header_title)
|
98
|
-
|
97
|
+
element_cast = MovieDB::Movie.instance_eval { filter_movie_attr(header_title) }
|
99
98
|
|
100
|
-
|
101
|
-
|
99
|
+
0.upto(@e_t.length - 1) do |i|
|
100
|
+
element_array = []
|
102
101
|
|
103
102
|
element_array << element_cast[i].length
|
104
103
|
@sheet.row(1 + i).concat element_array
|
@@ -106,7 +105,7 @@ module MovieDB
|
|
106
105
|
end
|
107
106
|
|
108
107
|
def spreadsheet_body_numeric_data(header_title)
|
109
|
-
@e_t = element_title = MovieDB::Movie.instance_eval{filter_movie_attr(header_title)}
|
108
|
+
@e_t = element_title = MovieDB::Movie.instance_eval { filter_movie_attr(header_title) }
|
110
109
|
|
111
110
|
element_title.each_with_index do |element2, i|
|
112
111
|
element_array = element_title[(i)]
|
data/lib/movieDB/data_process.rb
CHANGED
@@ -5,6 +5,7 @@ module MovieDB
|
|
5
5
|
PATH_AOV = MovieDB::DataAnalysis::AnalysisOfVariance::LeastSquares
|
6
6
|
extend PATH_AOV::Statistic
|
7
7
|
extend PATH_AOV::Coefficient_Of_Determination
|
8
|
+
|
8
9
|
include PATH_AOV::Explained_Sum_Of_Squares
|
9
10
|
include PATH_AOV::Fraction_Of_Variance_Unexplained
|
10
11
|
include PATH_AOV::Gauss_Newton_Algorithm
|
data/lib/movieDB/genres/en.txt
CHANGED
data/lib/movieDB/person.rb
CHANGED
@@ -1,20 +1,19 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'time'
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
3
|
+
# Create an actor instance and return the values for the actor variable.
|
4
|
+
#
|
5
|
+
# actor = MovieDB::Actor.instance_eval{create_with_info("Brittany Murphy", "F", "1977-11-10", "2009-12-20")}
|
6
|
+
# actor = MovieDB::Actor.instance_eval{create_with_info("George Clooney", "M", "1961-05-06", nil)}
|
7
|
+
|
8
|
+
# Example to find the actor name:
|
9
|
+
#
|
10
|
+
# actor_name = actor.map(&:name) #=> ["Brittany Murphy"]
|
11
|
+
#
|
12
|
+
# Example to see if an actor is alive:
|
13
|
+
# actor_name = actor.map(&:alive?) #=> [false, true]
|
14
|
+
#
|
15
|
+
# Example to find an actor's age:
|
16
|
+
# actor_name = actor.map(&:age) #=> [32, 52]
|
18
17
|
module MovieDB
|
19
18
|
class Person
|
20
19
|
attr_accessor :name, :gender, :birth_date, :death_date, :birthplace
|
@@ -49,27 +48,28 @@ module MovieDB
|
|
49
48
|
person.gender = gender
|
50
49
|
person.birth_date = birth_date
|
51
50
|
person.death_date = death_date
|
51
|
+
|
52
52
|
return @person_DS << person
|
53
53
|
end
|
54
54
|
|
55
55
|
def filter_person(attr)
|
56
56
|
attr = attr.to_sym
|
57
|
-
raise ArgumentError
|
57
|
+
raise ArgumentError "#{attr} can only be name or age" if !attr == :age && :name
|
58
|
+
|
58
59
|
return @person_DS.select{|s| s.alive?}.map(&attr)
|
59
60
|
end
|
60
61
|
|
61
|
-
|
62
|
-
|
63
|
-
# rather than a float between min to max.(Ruby 2.0.0)
|
64
|
-
#
|
65
|
-
|
62
|
+
# Returns a random parameter integer between min to max,
|
63
|
+
# rather than a float between min to max.(Ruby 2.0.0)
|
66
64
|
def sample_attr(attr)
|
67
65
|
randgen = Object.new
|
68
66
|
attr_array = self.instance_eval{filter_person(attr)}
|
69
67
|
attr_array.sample(random: randgen)
|
70
68
|
end
|
71
69
|
end
|
70
|
+
|
72
71
|
private_class_method :create_with_info, :filter_person
|
72
|
+
|
73
73
|
end
|
74
74
|
|
75
75
|
class Actor < Person
|
@@ -85,7 +85,8 @@ module MovieDB
|
|
85
85
|
end
|
86
86
|
|
87
87
|
def actor_actress_gender(person)
|
88
|
-
case
|
88
|
+
case
|
89
|
+
when person.gender == 'F'
|
89
90
|
return "actress"
|
90
91
|
when person.gender == "M"
|
91
92
|
return "actor"
|
@@ -95,23 +96,22 @@ module MovieDB
|
|
95
96
|
end
|
96
97
|
|
97
98
|
class << self
|
98
|
-
|
99
99
|
def filter_actor_alive(attr)
|
100
100
|
attr = attr.to_sym
|
101
|
-
raise ArgumentError
|
101
|
+
raise ArgumentError "#{attr} can only be name or age" if !attr == :age && :name
|
102
|
+
|
102
103
|
return @person_DS.select{|s| s.alive?}.map(&"#{attr.to_sym}")
|
103
104
|
end
|
104
105
|
|
105
106
|
def filter_actor_deceased(actor)
|
106
|
-
return @person_DS.select{|s| !s.alive?}.map{|m| "#{m.age}"} if attr == "age"
|
107
|
-
return @person_DS.select{|s| !s.alive?}.map{|m| "#{m.name}"} if attr == "name"
|
107
|
+
return @person_DS.select{ |s| !s.alive?}.map{ |m| "#{m.age}" } if attr == "age"
|
108
|
+
return @person_DS.select{ |s| !s.alive?}.map{ |m| "#{m.name}" } if attr == "name"
|
108
109
|
end
|
109
110
|
|
110
111
|
end
|
111
112
|
end
|
112
113
|
|
113
114
|
class Writer < Person
|
114
|
-
|
115
115
|
attr_accessor :published_work
|
116
116
|
alias :published? :published_work
|
117
117
|
|
@@ -123,7 +123,6 @@ module MovieDB
|
|
123
123
|
end
|
124
124
|
|
125
125
|
class Director < Person
|
126
|
-
|
127
126
|
attr_accessor :filmography
|
128
127
|
|
129
128
|
def initialize(filmography = [])
|