movieDB 0.3.4 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,263 +0,0 @@
1
- require 'MovieDB'
2
-
3
- module MovieDB
4
- # Analyzing, inspecting, cleaning, transforming and modeling data.
5
- #
6
- class DataAnalysis < MovieDB::Movie
7
module AnalysisOfVariance
  module LeastSquares
    # Descriptive statistics (mean, median, range, mode and sample standard
    # deviation) for the columns of a report spreadsheet found under
    # 'reports/'.
    #
    # The methods share state through instance variables (@book, @sheet,
    # @directory_name, @row_count, ...) and expect the `Spreadsheet` constant
    # to be loaded by the host application.
    module Statistic
      # Columns 1..TOTAL_COLUMNS are summarised; column 0 holds the row labels.
      TOTAL_COLUMNS = 22

      # A column is summarised only when every non-blank cell lies in this
      # range; anything else (text, zero, negatives) yields "N/A".
      NUMERIC_RANGE = (1..99_999_999_999).freeze

      # Opens the report, appends the summary rows and writes the result to a
      # new file under 'reports/'.
      #
      # @param directory_name [String] file name of the report inside 'reports/'
      # @return [String, nil] name of the written file, or nil when the report
      #   holds fewer than two movies (a notice is printed instead)
      def basic_statistic(directory_name)
        open_spreadsheet(directory_name)
        @directory_name = directory_name

        if check_imdb_count
          puts "*" * 41
          puts "* A minimum of 2 Imdb id's are required *"
          puts "* To perform statistical data analysis *"
          puts "* You only have ONE Imdb id entered *"
          puts "*" * 41
        else
          perform_computation
          insert_data_to_existing_xls_file
        end
      end

      # Loads the workbook and its first worksheet into @book / @sheet and
      # widens column 22.
      #
      # @param directory_name [String] file name of the report inside 'reports/'
      def open_spreadsheet(directory_name)
        @book = Spreadsheet.open File.join('reports', directory_name)
        @sheet = @book.worksheet(0)

        @sheet.column(22).width = "worldwide_gross".length
      end

      # @return [Boolean] true when the sheet has fewer than two data rows
      #   (the first row is the header), i.e. there is not enough data
      def check_imdb_count
        @sheet.rows.count - 1 < 2
      end

      # Computes the summary statistics of every data column and writes them
      # below the data, leaving one blank row in between.
      #
      # Mean: sum of the values divided by their count.
      # Median: middle value (or the average of the two middle values).
      # Range: difference between the highest and the lowest value.
      # Mode: most frequently occurring value (smallest one on a tie).
      # Standard deviation: sample standard deviation (n - 1 denominator).
      #
      # Columns that are empty or hold a value outside NUMERIC_RANGE get
      # "N/A" for every statistic.
      #
      # TODO: Need to use coefficient statistical formula.
      def perform_computation
        @row_count = @sheet.rows.count

        1.upto(TOTAL_COLUMNS) do |c|
          @column = column_values(c)
          stats = summarize(@column)

          @mean, @median, @range, @mode, @standard_dev =
            stats.values_at(:mean, :median, :range, :mode, :standard_dev)

          write_summary(c, stats)
        end
      end

      # Builds "<module name>_<report name without extension>" and remembers
      # it in @data_analysis_name.
      #
      # @return [String]
      def report_name
        analysis = Module.nesting.first.name.split('::').last
        # Exported report names end in "_.xls"; a plain ".xls" is accepted
        # too so the result never keeps a stray extension.
        @data_analysis_name = "#{analysis}_#{@directory_name.sub(/_?\.xls\z/, '')}"
      end

      # Writes the workbook (including the summary rows) to
      # 'reports/<report_name>.xls'.
      #
      # @return [String] the file name that was written
      def insert_data_to_existing_xls_file
        filename = "#{report_name}.xls"
        @book.write File.join('reports', filename)
        filename
      end

      private

      # Non-blank cells of one column, header row excluded.
      def column_values(column_index)
        cells = []
        @sheet.each_with_index { |_row, i| cells << @sheet[i, column_index] }
        cells.drop(1).compact
      end

      # Statistics of one column as a Hash, or the "N/A" set when the column
      # cannot be summarised.
      def summarize(values)
        return unavailable_summary unless summarizable?(values)

        sorted = values.sort
        mean = sorted.sum / sorted.size.to_f

        {
          mean: mean,
          median: median_of(sorted),
          range: sorted.last - sorted.first,
          mode: mode_of(sorted),
          standard_dev: sample_standard_deviation(sorted, mean)
        }
      end

      # An empty column must not count as numeric (`all?` is true for []).
      def summarizable?(values)
        !values.empty? && values.all? { |v| NUMERIC_RANGE.include?(v) }
      end

      def unavailable_summary
        { mean: "N/A", median: "N/A", range: "N/A", mode: "N/A", standard_dev: "N/A" }
      end

      # Middle value of an already sorted list.
      def median_of(sorted)
        mid = sorted.size / 2
        return sorted[mid] if sorted.size.odd?

        (sorted[mid - 1] + sorted[mid]) / 2.0
      end

      # Most frequent value; the input is sorted, so ties resolve to the
      # smallest value.
      def mode_of(sorted)
        counts = Hash.new(0)
        sorted.each { |v| counts[v] += 1 }
        counts.max_by { |_value, count| count }.first
      end

      # Sample standard deviation; undefined ("N/A") for fewer than 2 values.
      def sample_standard_deviation(values, mean)
        return "N/A" if values.size < 2

        Math.sqrt(values.sum { |v| (v - mean)**2 } / (values.size - 1))
      end

      # Writes the label (column 0) and value of each statistic, starting two
      # rows below the last data row.
      def write_summary(column_index, stats)
        [
          ["Mean", stats[:mean]],
          ["Median", stats[:median]],
          ["Range", stats[:range]],
          ["Mode", stats[:mode]],
          ["Standard Deviation", stats[:standard_dev]]
        ].each_with_index do |(label, value), offset|
          row = @row_count + 2 + offset
          @sheet[row, 0] = label
          @sheet[row, column_index] = value
        end
      end
    end

    module Coefficient_Of_Determination
      # TODO: Add code.
    end

    # Empty placeholder modules; none of them defines behaviour yet.
    module Discrete_Least_Squares_Meshless_Method; end
    module Explained_Sum_Of_Squares; end
    module Fraction_Of_Variance_Unexplained; end
    module Gauss_Newton_Algorithm; end
    module Iteratively_Reweighted_Least_Squares; end
    module Lack_Of_Fit_Sum_Of_Squares; end
    module Least_Squares_Support_Vector_Machine; end
    module Mean_Squared_Error; end
    module Moving_Least_Sqares; end
    module Non_Linear_Iterative_Partial_Least_Squares; end
    module Non_Linear_Least_Squares; end
    module Ordinary_Least_Squares; end
    module Partial_Least_Squares_Regression; end
    module Partition_Of_Sums_Of_Squares; end
    module Proofs_Involving_Ordinary_Least_Squares; end
    module Residual_Sum_Of_Squares; end
    module Total_Least_Squares; end
    module Total_Sum_Of_Squares; end
  end
end
166
-
167
# Namespace of density-estimation techniques. Every nested module is an
# empty placeholder; no behaviour is defined here.
module EstimationOfDensity
  module Cluster_Weighted_Modeling
  end

  module Density_Estimation
  end

  module Discretization_Of_Continuous_Features
  end

  module Mean_Integrated_Squared_Error
  end

  module Multivariate_Kernel_Density_Estimation
  end

  module Variable_Kernel_Density_Estimation
  end
end
175
-
176
# Exploratory data analysis (EDA) is primarily for seeing what the data can
# tell us beyond the formal modeling or hypothesis testing task.
# The output will be visual material.
#
# Every nested module is an empty placeholder.
module ExploratoryDataAnalysis
  module Data_Reduction
  end

  module Table_Diagonalization
  end

  module Configural_Frequency_Analysis
  end

  module Median_Polish
  end

  module Stem_And_Leaf_Display
  end
end
186
-
187
-
188
# Namespace of data-mining techniques. Every nested module is an empty
# placeholder. Applied_DataMining is declared once; a second, identical
# declaration only re-opened the same module and added nothing.
module Data_Mining
  module Applied_DataMining; end
  module Cluster_Analysis; end
  module Dimension_Reduction; end
end
194
-
195
# Namespace of regression-analysis techniques. Every nested module is an
# empty placeholder. Both spellings, Choice_Modelling and Choice_Modeling,
# are declared.
module RegressionAnalysis
  module Choice_Modelling
  end

  module Generalized_Linear_Model
    module Binomial_Regression
    end

    module Generalized_Additive_Model
    end

    module Linear_Probability_Model
    end

    module Poisson_Regression
    end

    module Zero_Inflated_Model
    end
  end

  module Nonparametric_Regression
  end

  module Statistical_Outliers
  end

  module Regression_And_Curve_Fitting_Software
  end

  module Regression_Diagnostics
  end

  module Regression_Variable_Selection
  end

  module Regression_With_Time_Series_Structure
  end

  module Robust_Regression
  end

  module Choice_Modeling
  end
end
215
-
216
# Namespace of resampling techniques; the nested module is an empty
# placeholder.
module Resampling
  module Bootstrapping_Population
  end
end
219
-
220
# Namespace of sensitivity-analysis techniques. Every nested module is an
# empty placeholder.
module Sensitivity_Analysis
  module Variance_Based_Sensitivity_Analysis
  end

  module Elementary_Effects_Method
  end

  module Experimental_Uncertainty_Analysis
  end

  module Fourier_Amplitude_Sensitivity_Testing
  end

  module Hyperparameter
  end
end
227
-
228
# Namespace of time-series techniques; the nested module is an empty
# placeholder.
module Time_series_Analysis
  module Frequency_Deviation
  end
end
231
- end
232
-
233
# Builds a workbook for a named data analysis. Expects the `Spreadsheet`
# constant to be loaded by the host application.
class ExportData
  # Dispatches to the writer that matches the analysis name.
  #
  # The name is matched case-insensitively, with underscores and runs of
  # whitespace treated as single spaces, so "Coefficient_Of_Determination"
  # and "coefficient of determination" select the same writer.
  #
  # @param data [Enumerable] values written by the selected writer
  # @param data_analysis_name [String] name of the analysis
  # @return [Object, nil] the writer's result, or nil when no writer matches
  # @raise [ArgumentError] when the name is not a String, or the matching
  #   writer is not implemented
  def write_spreadsheet(data, data_analysis_name)
    raise ArgumentError, 'invalid attribute' unless data_analysis_name.is_a?(String)

    @data = data
    @data_analysis_name = data_analysis_name.tr('_', ' ').split.join(' ').downcase

    case @data_analysis_name
    when "coefficient of determination"
      write_coefficient_of_determination
    when "discrete least squares meshless method"
      # This class defines no writer for this analysis; report that as the
      # same ArgumentError callers already receive for unusable input.
      unless respond_to?(:write_discrete_least_squares_meshless_method, true)
        raise ArgumentError, 'invalid attribute'
      end

      write_discrete_least_squares_meshless_method
    end
  end

  # Creates a workbook with a header row and one row holding the values
  # given to #write_spreadsheet, one value per column.
  #
  # @return [Spreadsheet::Workbook] the populated workbook (not yet written
  #   to disk)
  def write_coefficient_of_determination
    book = Spreadsheet::Workbook.new

    sheet1 = book.create_worksheet name: "Data Analysis: #{@data_analysis_name}"
    sheet1.row(0).concat %w{title released_date worldwide_gross}

    Array(@data).each_with_index do |value, index|
      sheet1[1, index] = value.to_s
    end

    book
  end
end
263
- end
@@ -1,96 +0,0 @@
1
- require "spreadsheet"
2
- require "redis"
3
- require "json"
4
-
5
# Movie data fetched from IMDb is stored as a hash data type in redis.
# The keys and values are written into a spreadsheet for later data analysis.
module MovieDB
  # Mixin that exports movies to 'reports/<name>.xls'. State is shared through
  # instance variables (@db_redis, @imdb_ids, @book, @sheet). Expects the
  # `Spreadsheet` and `JSON` constants to be loaded.
  module DataExport
    IMDB_ATTRIBUTES_HEADERS = %w(title cast_members cast_characters cast_member_ids cast_members_characters
      trailer_url director writers filming_locations company genres languages countries
      length plot poster rating votes mpaa_rating tagline year release_date revenue)

    # Attributes written as text (values joined with a space); every other
    # attribute is written as the number of values it holds.
    # NOTE(review): 'language', 'vote' and 'release' match no header
    # ('languages', 'votes', 'release_date'), so those columns are written as
    # counts -- confirm whether that is intended.
    STRING_VALUE_ATTRIBUTES = %w[title language length rating vote release mpaa_rating year revenue].freeze

    # Exports the given movies to a new spreadsheet.
    #
    # @param db_redis [#hget] store answering hget("movie:<id>", attribute)
    # @param imdb_ids [Array] ids of the movies to export
    # @return [String] name of the file written under 'reports/'
    def export_movie_data(db_redis, imdb_ids)
      @db_redis = db_redis
      @imdb_ids = imdb_ids

      create_spreadsheet_file
      create_spreadsheet_report

      write_xls_file
    end

    # Ensures the 'reports' directory exists and prepares @book / @sheet.
    def create_spreadsheet_file
      directory_name = 'reports'
      create_directory(directory_name)

      Spreadsheet.client_encoding = 'UTF-8'

      @book = Spreadsheet::Workbook.new
      @sheet = @book.create_worksheet
      @sheet.name = report_name if @db_redis
      # A globally configured analysis name takes precedence over the
      # title-based sheet name.
      @sheet.name = "Data Analysis: #{$DATA_ANALYSIS_NAME}" if $DATA_ANALYSIS_NAME
    end

    # Creates the directory unless something already exists at that path.
    def create_directory(directory_name)
      # File.exist? rather than the File.exists? alias, which was removed in
      # Ruby 3.2.
      Dir.mkdir(directory_name) unless File.exist?(directory_name)
    end

    # Fills the sheet: header row first, then one row per movie.
    def create_spreadsheet_report
      create_spreadsheet_header
      create_spreadsheet_body
    end

    # Writes the attribute names into row 0 and sets the header and
    # number formats.
    def create_spreadsheet_header
      @sheet.row(0).concat MovieDB::DataExport::IMDB_ATTRIBUTES_HEADERS

      title_format = Spreadsheet::Format.new :color => :blue, :weight => :bold, :size => 13
      float_format = Spreadsheet::Format.new :number_format => "0.00"

      @sheet.row(0).default_format = title_format
      @sheet.column(1).default_format = float_format
      @sheet.column(16).default_format = float_format
      @sheet.column(22).default_format = float_format
    end

    # Writes one row per movie with exactly one cell per attribute: the
    # joined text for STRING_VALUE_ATTRIBUTES, the number of values otherwise.
    def create_spreadsheet_body
      @imdb_ids.each_with_index do |imdb_id, idx|
        row = @sheet.row(idx + 1)

        MovieDB::DataExport::IMDB_ATTRIBUTES_HEADERS.each do |attr_key|
          values = movie_values(imdb_id, attr_key)

          if STRING_VALUE_ATTRIBUTES.include?(attr_key)
            row.push(values.join(' '))
          else
            row.push(values.length)
          end
        end
      end
    end

    # Name built from the movie titles with spaces removed, e.g.
    # "imdb_TheMatrix_Jaws_". Every title, including the last, is followed by
    # an underscore; existing report file names rely on that, so it is kept.
    # A movie without a stored title contributes only the underscore.
    #
    # @return [String]
    def report_name
      titles = @imdb_ids.map do |imdb_id|
        "#{@db_redis.hget("movie:#{imdb_id}", "title").to_s.delete(' ')}_"
      end

      "imdb_#{titles.join}"
    end

    # Writes the workbook to 'reports/<report_name>.xls'.
    #
    # @return [String] the file name that was written
    def write_xls_file
      filename = "#{report_name}.xls"
      @book.write File.join('reports', filename)
      filename
    end

    private

    # Stored value of one attribute, always as an Array: the decoded list
    # when the value is a JSON array, otherwise the raw value wrapped in a
    # one-element Array (plain text, a bare number, a missing value).
    def movie_values(imdb_id, attr_key)
      raw = @db_redis.hget("movie:#{imdb_id}", attr_key.to_s)

      begin
        parsed = JSON.parse(raw)
      rescue StandardError
        # Not JSON (or nil): treat the stored value as plain text.
        return [raw]
      end

      parsed.is_a?(Array) ? parsed : [raw]
    end
  end
end
@@ -1,26 +0,0 @@
1
- require 'MovieDB/data_analysis'
2
-
3
module MovieDB
  # Gathers the least-squares analysis modules in one class: Statistic and
  # Coefficient_Of_Determination are extended (available on the class
  # itself), the others are included (available on instances).
  class DataProcess
    PATH_AOV = MovieDB::DataAnalysis::AnalysisOfVariance::LeastSquares

    [
      PATH_AOV::Statistic,
      PATH_AOV::Coefficient_Of_Determination
    ].each { |class_level| extend class_level }

    # Included one at a time, in this order, so the ancestor chain is the
    # same as with individual `include` statements.
    [
      PATH_AOV::Explained_Sum_Of_Squares,
      PATH_AOV::Fraction_Of_Variance_Unexplained,
      PATH_AOV::Gauss_Newton_Algorithm,
      PATH_AOV::Iteratively_Reweighted_Least_Squares,
      PATH_AOV::Lack_Of_Fit_Sum_Of_Squares,
      PATH_AOV::Least_Squares_Support_Vector_Machine,
      PATH_AOV::Mean_Squared_Error,
      PATH_AOV::Non_Linear_Iterative_Partial_Least_Squares,
      PATH_AOV::Non_Linear_Least_Squares,
      PATH_AOV::Ordinary_Least_Squares,
      PATH_AOV::Partial_Least_Squares_Regression,
      PATH_AOV::Partition_Of_Sums_Of_Squares,
      PATH_AOV::Residual_Sum_Of_Squares,
      PATH_AOV::Total_Least_Squares,
      PATH_AOV::Total_Sum_Of_Squares
    ].each { |instance_level| include instance_level }
  end
end
26
-
@@ -1,20 +0,0 @@
1
- ##
2
- #TODO: Re-word the responses to be human readable.
3
-
4
module MovieDB
  module MovieError
    # Raises the exception that corresponds to a response status.
    #
    # The status is read with `to_i`; 200, 404, 500 and 503 each raise their
    # own exception class, any other value raises nothing and returns nil.
    # The exception classes (OK, NotFound, Lookup, Unavailable) are resolved
    # only when their branch is taken.
    #
    # @param response [#to_i] response status
    # @return [nil] when the status is not one of the handled codes
    def raise_errors(response)
      status = response.to_i

      if status == 200
        raise OK, "(#{response}: Successful )"
      elsif status == 404
        raise NotFound, "(#{response}: Resource Not found)"
      elsif status == 500
        raise Lookup, "(#{response}: Internal Server Error.)"
      elsif status == 503
        raise Unavailable, "(#{response}: Resource is Unavailable.)"
      end
    end
  end
end
@@ -1,48 +0,0 @@
1
module MovieDB
  module StatusChecker
    # Checks the film release and reports the status.
    #
    # The including class must respond to `movie_status`, which may hold a
    # single release name or a list of them.
    #
    # Example of checking for status:
    #
    #   movie = Movie.new(film_release: ['theatrical', 'print'])
    #   movie.status_check
    def self.included(base)
      base.class_eval do
        def theatrical_released?
          released_as?('theatrical')
        end

        def video_released?
          released_as?('video')
        end

        def television_released?
          released_as?('television')
        end

        def internet_released?
          released_as?('internet')
        end

        def print_released?
          released_as?('print')
        end

        # @return [String] release category; the first matching rule wins,
        #   so the most specific combinations are tested first
        def status_check
          if theatrical_released? && television_released? && video_released? && print_released?
            "Wide Release"
          elsif theatrical_released? && print_released?
            "Modified Wide Release"
          elsif theatrical_released? && (internet_released? || print_released?)
            "Exclusive and Limited Runs"
          elsif theatrical_released? || television_released? || video_released? || print_released?
            "Territorial Saturation"
          else
            "Not Released"
          end
        end

        # True when `movie_status` is, or contains, the given release name.
        # Array() lets a single String, a list, or nil be handled alike.
        def released_as?(channel)
          Array(movie_status).include?(channel)
        end
        private :released_as?
      end
    end
  end
end