spout 0.8.0.beta5 → 0.8.0.beta6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -0
- data/README.md +12 -0
- data/lib/spout/actions.rb +9 -1
- data/lib/spout/commands/outliers.rb +59 -0
- data/lib/spout/helpers/array_statistics.rb +30 -0
- data/lib/spout/helpers/json_loader.rb +17 -0
- data/lib/spout/models/outlier_result.rb +37 -0
- data/lib/spout/tasks/engine.rake +7 -0
- data/lib/spout/version.rb +1 -1
- data/lib/spout/views/outliers.html.erb +150 -0
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e6045b21354f8f0af4c207823913ef0a75a12fb7
|
4
|
+
data.tar.gz: aa6e0e604deeec131d8641370c674df7d411065f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5a5c0432dd78583502043ad987fdf75727b5322de35bf799b1d9ea6c114129a823cc057463785c662c4867ede530a8425fcddd1c81788014d459f85837e95af5
|
7
|
+
data.tar.gz: 0a95fe2f9d085d2f7108f7fa2d22f5ac33c43a9361cdaf54d226913a2d085dd718f21ad0c1c4006b61effe9950d4d54c8f234c6e7c44da23f0accb146a94c220
|
data/CHANGELOG.md
CHANGED
@@ -7,6 +7,7 @@
|
|
7
7
|
- `charts`: Array of choices, numeric, or integer variables for charts
|
8
8
|
- The `spout pngs` command now renders the histogram form for each variable
|
9
9
|
- The `spout coverage` command now lists variables that are defined in the data dictionary and that do not exist in any CSV dataset
|
10
|
+
- Added `spout outliers` command that returns a list of integer or numeric variables that contain major and minor outliers
|
10
11
|
- **Gem Changes**
|
11
12
|
- Updated to colorize 0.7.2
|
12
13
|
- Use of Ruby 2.1.2 is now recommended
|
data/README.md
CHANGED
@@ -132,6 +132,18 @@ This will generate an `index.html` file that can be opened and viewed in any bro
|
|
132
132
|
|
133
133
|
Spout coverage validates that values stored in your dataset match up with variables and domains defined in your data dictionary.
|
134
134
|
|
135
|
+
### Identify outliers in your dataset
|
136
|
+
|
137
|
+
Spout lets you detect outliers in your underlying datasets. Place your dataset CSVs into `./csvs/<version>/` and then run the following Spout command:
|
138
|
+
|
139
|
+
```
|
140
|
+
spout outliers
|
141
|
+
```
|
142
|
+
|
143
|
+
This will generate an `outliers.html` file that can be opened and viewed in any browser.
|
144
|
+
|
145
|
+
Spout outliers computes the [inner and outer fences](http://www.wikihow.com/Calculate-Outliers) to identify minor and major outliers in the dataset.
|
146
|
+
|
135
147
|
### Create a CSV Data Dictionary from your JSON repository
|
136
148
|
|
137
149
|
Provide an optional version parameter to name the folder the CSVs will be generated in, defaults to what is in `VERSION` file, or if that does not exist `1.0.0`.
|
data/lib/spout/actions.rb
CHANGED
@@ -25,6 +25,8 @@ module Spout
|
|
25
25
|
generate_images(argv.last(argv.size - 1))
|
26
26
|
when 'graphs', '-graphs', '--graphs', 'g', '-g'
|
27
27
|
generate_charts_and_tables(argv.last(argv.size - 1))
|
28
|
+
when 'outliers', '-outliers', '--outliers', 'o', '-o'
|
29
|
+
outliers_report(argv)
|
28
30
|
else
|
29
31
|
help
|
30
32
|
end
|
@@ -69,7 +71,9 @@ The most common spout commands are:
|
|
69
71
|
h[y]brid [1.0.0] Export the JSON dictionary in the Hybrid
|
70
72
|
dictionary format
|
71
73
|
[c]overage Coverage report, requires dataset CSVs
|
72
|
-
in `<project_name>/csvs
|
74
|
+
in `<project_name>/csvs/<version>`
|
75
|
+
[o]utliers Outlier report, requires dataset CSVs
|
76
|
+
in `<project_name>/csvs/<version>`
|
73
77
|
[p]ngs Generates images for each variable in a
|
74
78
|
dataset and places them
|
75
79
|
in `<project_name>/images/<version>/`
|
@@ -130,6 +134,10 @@ EOT
|
|
130
134
|
system "bundle exec rake spout:coverage"
|
131
135
|
end
|
132
136
|
|
137
|
+
# Runs the `spout:outliers` rake task through Bundler.
#
# `argv` is accepted for symmetry with the command dispatcher but is not
# consulted here.
def outliers_report(argv)
  system('bundle exec rake spout:outliers')
end
|
140
|
+
|
133
141
|
# Extracts the values of every flag written as `--<param>-<value>`.
#
# flags - Array of command line argument strings.
# param - Flag name (without the leading dashes) to look for.
#
# Returns an Array with the `<value>` part of each matching flag, in the
# order the flags were given. Flags with an empty value (`--<param>-`) and
# flags for other parameters are ignored.
def flag_values(flags, param)
  prefix = "--#{param}-"
  flags.select { |flag| flag.start_with?(prefix) && flag.length > prefix.length }
       .map { |flag| flag[prefix.length..-1] }
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'erb'
require 'fileutils'
require 'yaml'

require 'spout/helpers/subject_loader'
require 'spout/models/outlier_result'
|
5
|
+
|
6
|
+
module Spout
  module Commands
    # Builds the outliers report: one OutlierResult per method reported by the
    # subject loader, a per-CSV summary, and an HTML page rendered from
    # `views/outliers.html.erb` into `<pwd>/coverage/outliers.html`.
    class Outliers
      # standard_version - Dataset version; CSV files are looked up under
      #                    `csvs/<standard_version>/`.
      #
      # Reads the `visit` setting from `.spout.yml` and loads the subjects
      # from the CSVs through Spout::Helpers::SubjectLoader.
      def initialize(standard_version)
        @standard_version = standard_version

        @variable_files = Dir.glob('variables/**/*.json')
        @valid_ids = []
        @number_of_rows = nil

        spout_config = YAML.load_file('.spout.yml')
        @visit = (spout_config.kind_of?(Hash) ? spout_config['visit'].to_s.strip : '')

        @subject_loader = Spout::Helpers::SubjectLoader.new(@variable_files, @valid_ids, @standard_version, @number_of_rows, @visit)
        @subject_loader.load_subjects_from_csvs!
        @subjects = @subject_loader.subjects
      end

      # Computes the outlier results, writes `coverage/outliers.html`, opens
      # it with the platform's opener when one is known, and prints the path
      # of the generated file.
      def run_outliers_report!
        @outlier_results = build_outlier_results
        @csv_files = Dir.glob("csvs/#{@standard_version}/*.csv")
        @overall_results = build_overall_results

        coverage_folder = File.join(Dir.pwd, 'coverage')
        FileUtils.mkpath coverage_folder
        html_file = File.join(coverage_folder, 'outliers.html')

        print "\nGenerating: outliers.html\n\n"

        write_report(html_file)
        open_report(html_file)
        puts "#{html_file}\n\n"
      end

      private

      # One OutlierResult per loaded method, ordered by weight and then by
      # method name (weight 0 = major outliers present, 1 = only minor
      # outliers, 2 = none).
      def build_outlier_results
        results = @subject_loader.all_methods.map do |method, csv_files|
          Spout::Models::OutlierResult.new(@subjects, method, csv_files)
        end
        results.sort_by! { |result| [result.weight, result.method] }
      end

      # One row per CSV: [csv_file, columns with major outliers, columns with
      # only minor outliers, sum of both].
      def build_overall_results
        @csv_files.map do |csv_file|
          major_outliers = count_results_with_weight(csv_file, 0)
          minor_outliers = count_results_with_weight(csv_file, 1)
          [csv_file, major_outliers, minor_outliers, major_outliers + minor_outliers]
        end
      end

      # Number of outlier results that belong to csv_file and carry the given
      # weight.
      def count_results_with_weight(csv_file, weight)
        @outlier_results.count do |result|
          result.csv_files.include?(csv_file) && result.weight == weight
        end
      end

      # Renders the ERB view against this object's binding (the template reads
      # the instance variables set above) and writes it to html_file.
      def write_report(html_file)
        erb_location = File.join(File.dirname(__FILE__), '../views/outliers.html.erb')
        File.open(html_file, 'w+') do |file|
          file.puts ERB.new(File.read(erb_location)).result(binding)
        end
      end

      # Opens the report with `start` on mingw platforms or `open` on darwin;
      # does nothing on any other platform.
      def open_report(html_file)
        open_command = if RUBY_PLATFORM =~ /mingw/
                         'start'
                       elsif RUBY_PLATFORM =~ /darwin/
                         'open'
                       end
        system "#{open_command} #{html_file}" if open_command
      end
    end
  end
end
|
@@ -80,6 +80,36 @@ class Array
|
|
80
80
|
self.compact.max
|
81
81
|
end
|
82
82
|
|
83
|
+
# Returns the numeric values lying outside the inner fences
# (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR), in ascending order.
# nil and non-numeric entries are ignored; returns [] when the quartiles
# cannot be computed.
def outliers
  values_outside_fences(1.5)
end

# Returns the numeric values lying outside the outer fences
# (Q1 - 3 * IQR, Q3 + 3 * IQR), in ascending order.
# nil and non-numeric entries are ignored; returns [] when the quartiles
# cannot be computed.
def major_outliers
  values_outside_fences(3)
end

# Returns the outliers that are not major outliers, i.e. the values between
# the inner and outer fences.
def minor_outliers
  outliers - major_outliers
end

# Shared fence computation for #outliers and #major_outliers.
#
# multiplier - How many interquartile ranges the fences sit beyond Q1 and Q3.
#
# Non-numeric entries are dropped BEFORE sorting so that an array mixing
# numbers with strings does not raise while being sorted.
def values_outside_fences(multiplier)
  array = compact.select { |v| v.kind_of?(Numeric) }.sort
  q1 = (array.quartile_one + array.quartile_two).median
  q3 = (array.quartile_three + array.quartile_four).median
  return [] if q1.nil? || q3.nil?

  spread = (q3 - q1) * multiplier
  lower_fence = q1 - spread
  upper_fence = q3 + spread
  array.select { |v| v > upper_fence || v < lower_fence }
end
private :values_outside_fences
|
112
|
+
|
83
113
|
end
|
84
114
|
|
85
115
|
module Spout
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'json'

module Spout
  module Helpers
    # Loads JSON definition files from the working directory.
    class JsonLoader
      # Finds the first file matching `<file_type>s/**/<file_name>.json`
      # (relative to the current directory) and parses it.
      #
      # file_name - Base name of the JSON file, without extension.
      # file_type - Singular folder name, e.g. 'variable' for `variables/`.
      #
      # Returns the parsed JSON, or nil when no file matches or the file
      # cannot be read or parsed.
      def self.get_json(file_name, file_type)
        file = Dir.glob("#{file_type}s/**/#{file_name}.json").first
        return nil if file.nil?

        JSON.parse(File.read(file))
      rescue JSON::ParserError, EncodingError, SystemCallError, IOError
        # Unreadable or malformed files are reported as "not available".
        nil
      end

      # Loads `variables/**/<variable_name>.json`; see get_json.
      def self.get_variable(variable_name)
        get_json(variable_name, 'variable')
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'spout/helpers/array_statistics'
|
2
|
+
require 'spout/helpers/json_loader'
|
3
|
+
|
4
|
+
module Spout
  module Models
    # Outlier summary for one column (`method`) across a list of subjects.
    #
    # Readers:
    #   csv_files      - the csv_files argument, as given
    #   method         - the column/method name (note: this reader shadows
    #                    Object#method on instances of this class)
    #   outliers       - unique values reported by Array#outliers
    #   major_outliers - unique values reported by Array#major_outliers
    #   minor_outliers - outliers that are not major outliers
    #   weight         - 0 when major outliers exist, 1 when only minor
    #                    outliers exist, 2 when there are none
    #   units          - 'units' entry of the variable's JSON, or nil
    class OutlierResult
      attr_reader :csv_files, :method, :major_outliers, :minor_outliers, :outliers, :weight, :units

      # subjects  - Objects responding to `method`.
      # method    - Name of the column to analyze (String or Symbol).
      # csv_files - CSV files the column appears in.
      def initialize(subjects, method, csv_files)
        @values = subjects.map(&method.to_sym)
        @csv_files = csv_files
        @method = method

        calculate_outliers!

        @weight = if !@major_outliers.empty?
                    0
                  elsif !@minor_outliers.empty?
                    1
                  else
                    2
                  end

        variable = Spout::Helpers::JsonLoader.get_variable(method)
        @units = (variable.kind_of?(Hash) ? variable['units'] : nil)
      end

      # Populates @outliers, @major_outliers and @minor_outliers.
      #
      # Minor outliers are derived from the two lists already computed (all
      # outliers minus the major ones) rather than asking the values array to
      # recompute its fences another two times; the resulting list is the same.
      def calculate_outliers!
        @outliers = @values.outliers.uniq
        @major_outliers = @values.major_outliers.uniq
        @minor_outliers = @outliers - @major_outliers
      end
    end
  end
end
|
data/lib/spout/tasks/engine.rake
CHANGED
@@ -50,6 +50,13 @@ namespace :spout do
|
|
50
50
|
Spout::Commands::Coverage.new(standard_version)
|
51
51
|
end
|
52
52
|
|
53
|
+
# Builds the outliers report for the dataset version given by
# `standard_version`.
desc 'Identify Outliers in CSV dataset'
task :outliers do
  # Loaded lazily so the command is only required when the task runs.
  require 'spout/commands/outliers'
  Spout::Commands::Outliers.new(standard_version).run_outliers_report!
end
|
59
|
+
|
53
60
|
desc 'Match CSV dataset with JSON repository'
|
54
61
|
task :images do
|
55
62
|
require 'spout/commands/images'
|
data/lib/spout/version.rb
CHANGED
@@ -0,0 +1,150 @@
|
|
1
|
+
<html lang="en"><head>
|
2
|
+
<meta charset="utf-8">
|
3
|
+
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
4
|
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
5
|
+
<meta name="description" content="">
|
6
|
+
<meta name="author" content="">
|
7
|
+
<link rel="shortcut icon" href="">
|
8
|
+
|
9
|
+
<title>Spout Outliers</title>
|
10
|
+
|
11
|
+
<!-- Bootstrap core CSS -->
|
12
|
+
<link href="http://netdna.bootstrapcdn.com/bootstrap/3.1.1/css/bootstrap.min.css" rel="stylesheet">
|
13
|
+
|
14
|
+
<!-- HTML5 shim and Respond.js IE8 support of HTML5 elements and media queries -->
|
15
|
+
<!--[if lt IE 9]>
|
16
|
+
<script src="https://oss.maxcdn.com/libs/html5shiv/3.7.0/html5shiv.js"></script>
|
17
|
+
<script src="https://oss.maxcdn.com/libs/respond.js/1.4.2/respond.min.js"></script>
|
18
|
+
<![endif]-->
|
19
|
+
|
20
|
+
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.0/jquery.min.js"></script>
|
21
|
+
<script src="http://netdna.bootstrapcdn.com/bootstrap/3.1.1/js/bootstrap.min.js"></script>
|
22
|
+
<style type="text/css">
|
23
|
+
html {
|
24
|
+
position: relative;
|
25
|
+
min-height: 100%;
|
26
|
+
}
|
27
|
+
body {
|
28
|
+
padding-top: 50px;
|
29
|
+
/* Margin bottom by footer height */
|
30
|
+
margin-bottom: 60px;
|
31
|
+
}
|
32
|
+
#footer {
|
33
|
+
position: absolute;
|
34
|
+
bottom: 0;
|
35
|
+
width: 100%;
|
36
|
+
/* Set the fixed height of the footer here */
|
37
|
+
height: 60px;
|
38
|
+
background-color: #f5f5f5;
|
39
|
+
padding: 0px 30px;
|
40
|
+
}
|
41
|
+
code.success {
|
42
|
+
color: #468847;
|
43
|
+
background-color: #dff0d8;
|
44
|
+
}
|
45
|
+
code.default {
|
46
|
+
color: #333;
|
47
|
+
background-color: #f5f5f5;
|
48
|
+
}
|
49
|
+
code.danger {
|
50
|
+
color: #FFF;
|
51
|
+
background-color: #d9534f;
|
52
|
+
}
|
53
|
+
code.warning {
|
54
|
+
color: #333;
|
55
|
+
background-color: #f0ad4e;
|
56
|
+
}
|
57
|
+
tfoot td {
|
58
|
+
vertical-align: middle !important;
|
59
|
+
padding-bottom: 0px !important;
|
60
|
+
}
|
61
|
+
</style>
|
62
|
+
</head>
|
63
|
+
|
64
|
+
<body>
|
65
|
+
|
66
|
+
<div class="navbar navbar-inverse navbar-fixed-top" role="navigation">
|
67
|
+
<div class="navbar-header" style="padding: 0px 30px">
|
68
|
+
<a class="navbar-brand" href="#">Spout Outliers</a>
|
69
|
+
</div>
|
70
|
+
</div>
|
71
|
+
|
72
|
+
<% if @subject_loader.all_methods.size == 0 %>
|
73
|
+
<div class="container" style="margin-top:30px">
|
74
|
+
<div class="jumbotron">
|
75
|
+
<h1>You made Spout cry... :'-(</h1>
|
76
|
+
<p>No CSVs found in <code><%= Dir.pwd %>/csvs/<%= @standard_version %>/</code></p>
|
77
|
+
</div>
|
78
|
+
</div>
|
79
|
+
<% else %>
|
80
|
+
<div style="padding: 30px 30px 10px 30px;">
|
81
|
+
|
82
|
+
<table class="table">
|
83
|
+
<thead>
|
84
|
+
<tr>
|
85
|
+
<th>CSV</th>
|
86
|
+
<th>Major Outliers</th>
|
87
|
+
<th>Minor Outliers</th>
|
88
|
+
<th>Total Outliers</th>
|
89
|
+
</tr>
|
90
|
+
</thead>
|
91
|
+
<tbody>
|
92
|
+
<% @overall_results.each do |csv_file, major_outliers, minor_outliers, total_outliers| %>
|
93
|
+
<tr>
|
94
|
+
<td><%= csv_file %></td>
|
95
|
+
<td><%= major_outliers %></td>
|
96
|
+
<td><%= minor_outliers %></td>
|
97
|
+
<td><%= total_outliers %></td>
|
98
|
+
</tr>
|
99
|
+
<% end %>
|
100
|
+
</tbody>
|
101
|
+
</table>
|
102
|
+
|
103
|
+
|
104
|
+
<table class="table table-bordered table-hover">
|
105
|
+
<thead>
|
106
|
+
<tr>
|
107
|
+
<th>CSV</th>
|
108
|
+
<th>Column</th>
|
109
|
+
<th>Units</th>
|
110
|
+
<th>Major Outliers</th>
|
111
|
+
<th>Minor Outliers</th>
|
112
|
+
<th>Outliers</th>
|
113
|
+
</tr>
|
114
|
+
</thead>
|
115
|
+
<tbody>
|
116
|
+
<% @outlier_results.each do |outlier_result| %>
|
117
|
+
<tr>
|
118
|
+
<td>
|
119
|
+
<% outlier_result.csv_files.each do |csv_file| %>
|
120
|
+
<code class="<%= 'success' if outlier_result.outliers.size == 0 %>"><%= csv_file %></code><br />
|
121
|
+
<% end %></td>
|
122
|
+
<td><%= outlier_result.method %></td>
|
123
|
+
<td><%= outlier_result.units %></td>
|
124
|
+
<td>
|
125
|
+
<% outlier_result.major_outliers.collect{|v| v.round(2)}.uniq.each do |value| %>
|
126
|
+
<code class="danger"><%= value %></code>
|
127
|
+
<% end %>
|
128
|
+
</td>
|
129
|
+
<td>
|
130
|
+
<% outlier_result.minor_outliers.collect{|v| v.round(2)}.uniq.each do |value| %>
|
131
|
+
<code class="warning"><%= value %></code>
|
132
|
+
<% end %>
|
133
|
+
</td>
|
134
|
+
<td>
|
135
|
+
<% outlier_result.outliers.collect{|v| v.round(2)}.uniq.each do |value| %>
|
136
|
+
<code class="default"><%= value %></code>
|
137
|
+
<% end %>
|
138
|
+
</td>
|
139
|
+
</tr>
|
140
|
+
<% end %>
|
141
|
+
</tbody>
|
142
|
+
</table>
|
143
|
+
</div>
|
144
|
+
<% end %>
|
145
|
+
|
146
|
+
<div id="footer">
|
147
|
+
<p class="text-muted" style="margin: 20px 0;">Generated by <a href="https://github.com/sleepepi/spout">Spout</a> v<%= Spout::VERSION::STRING %></p>
|
148
|
+
</div>
|
149
|
+
|
150
|
+
</body></html>
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spout
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.0.
|
4
|
+
version: 0.8.0.beta6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Remo Mueller
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-05-
|
11
|
+
date: 2014-05-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -100,12 +100,15 @@ files:
|
|
100
100
|
- lib/spout/commands/coverage.rb
|
101
101
|
- lib/spout/commands/graphs.rb
|
102
102
|
- lib/spout/commands/images.rb
|
103
|
+
- lib/spout/commands/outliers.rb
|
103
104
|
- lib/spout/helpers/array_statistics.rb
|
104
105
|
- lib/spout/helpers/chart_types.rb
|
106
|
+
- lib/spout/helpers/json_loader.rb
|
105
107
|
- lib/spout/helpers/subject_loader.rb
|
106
108
|
- lib/spout/helpers/table_formatting.rb
|
107
109
|
- lib/spout/hidden_reporter.rb
|
108
110
|
- lib/spout/models/coverage_result.rb
|
111
|
+
- lib/spout/models/outlier_result.rb
|
109
112
|
- lib/spout/models/subject.rb
|
110
113
|
- lib/spout/support/javascripts/data.js
|
111
114
|
- lib/spout/support/javascripts/highcharts-convert.js
|
@@ -134,6 +137,7 @@ files:
|
|
134
137
|
- lib/spout/tests/variable_type_validation.rb
|
135
138
|
- lib/spout/version.rb
|
136
139
|
- lib/spout/views/index.html.erb
|
140
|
+
- lib/spout/views/outliers.html.erb
|
137
141
|
- spout.gemspec
|
138
142
|
homepage: https://github.com/sleepepi
|
139
143
|
licenses:
|