movieDB 0.1.10 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +91 -2
- data/lib/movieDB/data_analysis.rb +95 -20
- data/lib/movieDB/data_process.rb +1 -0
- data/lib/movieDB/version.rb +1 -1
- data/reports/{Coefficient_Of_Determination_2013121918.xls → Statistic_2013122017.xls} +0 -0
- data/spec/data_process_spec.rb +2 -3
- metadata +3 -5
- data/reports/add.xls +0 -0
- data/reports/imdb_raw_data_2013121912.xls +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4d6f86e45dd6df3973448eb08ac5e76258be9fe6
|
4
|
+
data.tar.gz: 740512f4b3c575ca884148584b24eaae6b62ea05
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8416c049afe77426e2fd6daf29b91e0f60528cf24309effd09d072c10330a9f1d5fa27bf8d59eee2029b4be805ccf25d5253c4205dad1c485c738fce253698c1
|
7
|
+
data.tar.gz: 23f5c489154e0c4ce12452056cd4a5a5465de77df80fa26a21e74bfb9eefc85bbfb09922f6bb728411737016bed2d44980ba93bdede2c7929daf782981ce0a79
|
data/README.md
CHANGED
@@ -47,10 +47,99 @@ The exported movie data is stored in your reports directory.
|
|
47
47
|
|
48
48
|
$ cd /reports/imdb_raw_data_20131216.xls
|
49
49
|
|
50
|
-
## Usage - Data
|
50
|
+
## Usage - Analysing Data and Generating Stats
|
51
51
|
|
52
52
|
$ irb
|
53
53
|
|
54
54
|
> require 'MovieDB/data_analysis'
|
55
55
|
|
56
|
-
>
|
56
|
+
> require 'MovieDB/data_process'
|
57
|
+
|
58
|
+
> MovieDB::DataProcess.send(:basic_statistic, 'imdb_raw_data_20131216.xls')
|
59
|
+
|
60
|
+
## Exported - Analyzed Data
|
61
|
+
|
62
|
+
The exported analyzed data is stored in your reports directory.
|
63
|
+
|
64
|
+
$ cd /reports/basic_statistic_20131216.xls
|
65
|
+
|
66
|
+
## What's Next
|
67
|
+
|
68
|
+
More statistical computations coming. This includes:
|
69
|
+
|
70
|
+
> Gauss_Newton_Algorithm
|
71
|
+
> Iteratively_Reweighted_Least_Squares
|
72
|
+
> Lack_Of_Fit_Sum_Of_Squares
|
73
|
+
> Least_Squares_Support_Vector_Machine
|
74
|
+
> Mean_Squared_Error
|
75
|
+
> Moving_Least_Squares
|
76
|
+
> Non_Linear_Iterative_Partial_Least_Squares
|
77
|
+
> Non_Linear_Least_Squares
|
78
|
+
> Ordinary_Least_Squares
|
79
|
+
> Partial_Least_Squares_Regression
|
80
|
+
> Partition_Of_Sums_Of_Squares
|
81
|
+
> Proofs_Involving_Ordinary_Least_Squares
|
82
|
+
> Residual_Sum_Of_Squares
|
83
|
+
> Total_Least_Squares
|
84
|
+
> Total_Sum_Of_Squares
|
85
|
+
|
86
|
+
> EstimationOfDensity
|
87
|
+
> Cluster_Weighted_Modeling
|
88
|
+
> Density_Estimation
|
89
|
+
> Discretization_Of_Continuous_Features
|
90
|
+
> Mean_Integrated_Squared_Error
|
91
|
+
> Multivariate_Kernel_Density_Estimation
|
92
|
+
> Variable_Kernel_Density_Estimation
|
93
|
+
|
94
|
+
> ExploratoryDataAnalysis
|
95
|
+
> Data_Reduction
|
96
|
+
> Table_Diagonalization
|
97
|
+
> Configural_Frequency_Analysis
|
98
|
+
> Median_Polish
|
99
|
+
> Stem_And_Leaf_Display
|
100
|
+
|
101
|
+
> Data_Mining
|
102
|
+
> Applied_DataMining
|
103
|
+
> Cluster_Analysis
|
104
|
+
> Dimension_Reduction
|
105
|
+
> Applied_DataMining
|
106
|
+
|
107
|
+
> RegressionAnalysis
|
108
|
+
> Choice_Modelling
|
109
|
+
|
110
|
+
> Generalized_Linear_Model
|
111
|
+
> Binomial_Regression
|
112
|
+
> Generalized_Additive_Model
|
113
|
+
> Linear_Probability_Model
|
114
|
+
> Poisson_Regression
|
115
|
+
> Zero_Inflated_Model
|
116
|
+
|
117
|
+
> Nonparametric_Regression
|
118
|
+
> Statistical_Outliers
|
119
|
+
> Regression_And_Curve_Fitting_Software
|
120
|
+
> Regression_Diagnostics
|
121
|
+
> Regression_Variable_Selection
|
122
|
+
> Regression_With_Time_Series_Structure
|
123
|
+
> Robust_Regression
|
124
|
+
> Choice_Modeling
|
125
|
+
|
126
|
+
> Resampling
|
127
|
+
> Bootstrapping_Population
|
128
|
+
|
129
|
+
> Sensitivity_Analysis
|
130
|
+
> Variance_Based_Sensitivity_Analysis
|
131
|
+
> Elementary_Effects_Method
|
132
|
+
> Experimental_Uncertainty_Analysis
|
133
|
+
> Fourier_Amplitude_Sensitivity_Testing
|
134
|
+
> Hyperparameter
|
135
|
+
|
136
|
+
> Time_Series_Analysis
|
137
|
+
> Frequency_Deviation
|
138
|
+
|
139
|
+
## Contact me
|
140
|
+
|
141
|
+
If you'd like to collaborate, please feel free to fork the source code on GitHub.
|
142
|
+
|
143
|
+
You can also contact me at albertmck@gmail.com
|
144
|
+
|
145
|
+
|
@@ -8,8 +8,9 @@ module MovieDB
|
|
8
8
|
class DataAnalysis < MovieDB::Movie
|
9
9
|
module AnalysisOfVariance
|
10
10
|
module LeastSquares
|
11
|
-
module
|
12
|
-
|
11
|
+
module Statistic
|
12
|
+
|
13
|
+
def basic_statistic (directory_name)
|
13
14
|
open_spreadsheet(directory_name)
|
14
15
|
perform_computation
|
15
16
|
insert_data_to_existing_xls_file
|
@@ -21,25 +22,102 @@ module MovieDB
|
|
21
22
|
end
|
22
23
|
|
23
24
|
def perform_computation
|
24
|
-
@col_0 = []
|
25
25
|
|
26
|
-
|
27
|
-
|
28
|
-
|
26
|
+
total_columns = 17
|
27
|
+
@column = []
|
28
|
+
@row_count = @sheet.rows.count
|
29
|
+
|
30
|
+
1.upto(total_columns) do |c|
|
31
|
+
@column = [] # set instance variable to an empty array
|
32
|
+
|
33
|
+
##
|
34
|
+
# loop through to collect all elements
|
35
|
+
# The returned array includes both strings and integers elements
|
36
|
+
|
37
|
+
@sheet.each_with_index do |row, i|
|
38
|
+
@column << @sheet[i, 0 + c ]
|
39
|
+
end
|
40
|
+
|
41
|
+
@column.shift # delete the string header from the array
|
42
|
+
@column.compact! # delete nil from the array
|
43
|
+
row_count = @sheet.rows.count
|
44
|
+
|
45
|
+
##
|
46
|
+
# Perform computation on the data collected
|
47
|
+
# TODO: Need to use coefficient statistical formula
|
48
|
+
# Calculate median as an example but COD formula must be used
|
49
|
+
|
50
|
+
|
51
|
+
if @column.all? {|i| (1..99999999).include? (i)}
|
52
|
+
|
53
|
+
n = @column.count
|
54
|
+
@column.sort!
|
55
|
+
|
56
|
+
##
|
57
|
+
# The mean is commonly called the average. It is defined as the sum of
|
58
|
+
# all the given elements divided by the total number of elements.
|
59
|
+
#
|
60
|
+
# Range is the difference between the highest and the lowest values in a
|
61
|
+
# frequency distribution.
|
62
|
+
#
|
63
|
+
# Mode is the most frequently occurring value in a frequency distribution.
|
29
64
|
|
30
|
-
|
31
|
-
|
32
|
-
# TODO: Need to use coefficienct statistical formula
|
33
|
-
# Calculate median as an example but COD formula must be used
|
65
|
+
@mean = @column.sum/n # Find the mean
|
66
|
+
@range = @column.max - @column.min # Find the range
|
34
67
|
|
35
|
-
|
68
|
+
freq = @column.inject(Hash.new(0)) { |h, v| h[v] += 1; h }
|
69
|
+
@mode = @column.sort_by { |v| freq[v]}.last # Find the mode
|
36
70
|
|
37
|
-
|
38
|
-
|
71
|
+
##
|
72
|
+
# Calculate Standard Deviation
|
73
|
+
# Standard deviation is a statistical measure of spread or variability.
|
74
|
+
# The standard deviation is the root mean square (RMS) deviation of the
|
75
|
+
# values from their arithmetic mean.
|
39
76
|
|
40
|
-
|
41
|
-
|
42
|
-
|
77
|
+
@column_squared = []
|
78
|
+
@column.each do |col|
|
79
|
+
@column_squared << col**2
|
80
|
+
end
|
81
|
+
|
82
|
+
@sum_of_column = @column.sum
|
83
|
+
@sum_of_column_squared = @column_squared.sum
|
84
|
+
@standard_dev = Math.sqrt((@sum_of_column_squared -((@sum_of_column)*(@sum_of_column)/n))/(n-1))
|
85
|
+
|
86
|
+
if n.odd?
|
87
|
+
index = (n + 1)/2
|
88
|
+
@median = @column[index - 1] # Subtract -1 to reduce index value since array start with an index 0.
|
89
|
+
else
|
90
|
+
middle_index = n/2
|
91
|
+
right_index = middle_index + 1
|
92
|
+
@median = (@column[middle_index - 1] + @column[right_index - 1])/2
|
93
|
+
end
|
94
|
+
|
95
|
+
else
|
96
|
+
@median = "N/A"
|
97
|
+
@mean = "N/A"
|
98
|
+
@range = "N/A"
|
99
|
+
@mode = "N/A"
|
100
|
+
@standard_dev = "N/A"
|
101
|
+
end
|
102
|
+
|
103
|
+
##
|
104
|
+
# Insert results into spreadsheet cell
|
105
|
+
|
106
|
+
@sheet[@row_count + 2, 0 ] = "Mean"
|
107
|
+
@sheet[@row_count + 2, 0 + c ] = @mean
|
108
|
+
|
109
|
+
@sheet[@row_count + 3, 0 ] = "Median"
|
110
|
+
@sheet[@row_count + 3, 0 + c ] = @median
|
111
|
+
|
112
|
+
@sheet[@row_count + 4, 0 ] = "Range"
|
113
|
+
@sheet[@row_count + 4, 0 + c ] = @range
|
114
|
+
|
115
|
+
@sheet[@row_count + 5, 0 ] = "Mode"
|
116
|
+
@sheet[@row_count + 5, 0 + c ] = @mode
|
117
|
+
|
118
|
+
@sheet[@row_count + 6, 0 ] = "Standard Deviation"
|
119
|
+
@sheet[@row_count + 6, 0 + c ] = @standard_dev
|
120
|
+
end
|
43
121
|
end
|
44
122
|
|
45
123
|
def report_name
|
@@ -51,14 +129,11 @@ module MovieDB
|
|
51
129
|
|
52
130
|
def insert_data_to_existing_xls_file
|
53
131
|
filename = ("#{report_name}.xls")
|
54
|
-
#@book.worksheet(0).insert_row(4, [@data_processing ])
|
55
|
-
@sheet[5, 1] = @data_processing
|
56
|
-
@sheet.row(6).push "Median", @data_processing
|
57
|
-
|
58
132
|
@book.write File.join('reports', filename)
|
59
133
|
return filename
|
60
134
|
end
|
61
135
|
end
|
136
|
+
module Coefficient_Of_Determination; end
|
62
137
|
module Discrete_Least_Squares_Meshless_Method; end
|
63
138
|
module Explained_Sum_Of_Squares; end
|
64
139
|
module Fraction_Of_Variance_Unexplained; end
|
data/lib/movieDB/data_process.rb
CHANGED
@@ -3,6 +3,7 @@ require 'MovieDB/data_analysis'
|
|
3
3
|
module MovieDB
|
4
4
|
class DataProcess
|
5
5
|
PATH_AOV = MovieDB::DataAnalysis::AnalysisOfVariance::LeastSquares
|
6
|
+
extend PATH_AOV::Statistic
|
6
7
|
extend PATH_AOV::Coefficient_Of_Determination
|
7
8
|
include PATH_AOV::Explained_Sum_Of_Squares
|
8
9
|
include PATH_AOV::Fraction_Of_Variance_Unexplained
|
data/lib/movieDB/version.rb
CHANGED
Binary file
|
data/spec/data_process_spec.rb
CHANGED
@@ -5,11 +5,10 @@ describe MovieDB::DataProcess do
|
|
5
5
|
describe "#AnalysisOfVariance" do
|
6
6
|
describe "#LeastSquares" do
|
7
7
|
describe "#Coefficient_Of_Determination" do
|
8
|
-
let(:
|
9
|
-
#let(:cod) {MovieDB::DataProcess.analyze_cod(imdb_raw_data_2013121820.xls)}
|
8
|
+
let(:basic_stat) {MovieDB::DataProcess}
|
10
9
|
|
11
10
|
it "should return the cof" do
|
12
|
-
|
11
|
+
basic_stat.send(:basic_statistic, 'imdb_raw_data_2013121911.xls').should == []
|
13
12
|
end
|
14
13
|
|
15
14
|
it "raise error if file does not exist" do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: movieDB
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Albert_McKeever
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-12-
|
11
|
+
date: 2013-12-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -164,10 +164,8 @@ files:
|
|
164
164
|
- movieDB.gemspec
|
165
165
|
- npm-debug.log
|
166
166
|
- reports/.DS_Store
|
167
|
-
- reports/
|
168
|
-
- reports/add.xls
|
167
|
+
- reports/Statistic_2013122017.xls
|
169
168
|
- reports/imdb_raw_data_2013121911.xls
|
170
|
-
- reports/imdb_raw_data_2013121912.xls
|
171
169
|
- spec/.DS_Store
|
172
170
|
- spec/data_export_spec.rb
|
173
171
|
- spec/data_process_spec.rb
|
data/reports/add.xls
DELETED
Binary file
|
Binary file
|