chicagowarehouse 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.rspec +1 -0
- data/Gemfile +18 -0
- data/LICENSE +20 -0
- data/README +11 -0
- data/Rakefile +50 -0
- data/chicagowarehouse.gemspec +134 -0
- data/lib/chicago.rb +32 -0
- data/lib/chicago/core_ext/hash.rb +18 -0
- data/lib/chicago/core_ext/sequel/dataset.rb +7 -0
- data/lib/chicago/core_ext/sequel/sql.rb +62 -0
- data/lib/chicago/data/month.rb +98 -0
- data/lib/chicago/database/constants.rb +18 -0
- data/lib/chicago/database/dataset_builder.rb +75 -0
- data/lib/chicago/database/filter.rb +109 -0
- data/lib/chicago/database/migration_file_writer.rb +34 -0
- data/lib/chicago/database/schema_generator.rb +117 -0
- data/lib/chicago/database/type_converters.rb +107 -0
- data/lib/chicago/database/value_parser.rb +23 -0
- data/lib/chicago/errors.rb +23 -0
- data/lib/chicago/query.rb +109 -0
- data/lib/chicago/rake_tasks.rb +50 -0
- data/lib/chicago/schema/builders/column_builder.rb +21 -0
- data/lib/chicago/schema/builders/dimension_builder.rb +69 -0
- data/lib/chicago/schema/builders/fact_builder.rb +74 -0
- data/lib/chicago/schema/builders/shrunken_dimension_builder.rb +54 -0
- data/lib/chicago/schema/builders/table_builder.rb +33 -0
- data/lib/chicago/schema/column.rb +221 -0
- data/lib/chicago/schema/column_parser.rb +127 -0
- data/lib/chicago/schema/dimension.rb +129 -0
- data/lib/chicago/schema/dimension_reference.rb +47 -0
- data/lib/chicago/schema/fact.rb +70 -0
- data/lib/chicago/schema/measure.rb +35 -0
- data/lib/chicago/schema/named_element.rb +16 -0
- data/lib/chicago/schema/named_element_collection.rb +64 -0
- data/lib/chicago/schema/query_column.rb +199 -0
- data/lib/chicago/schema/table.rb +41 -0
- data/lib/chicago/star_schema.rb +127 -0
- data/spec/core_ext/sequel_extensions_spec.rb +29 -0
- data/spec/data/month_spec.rb +67 -0
- data/spec/database/db_type_converter_spec.rb +125 -0
- data/spec/database/migration_file_writer_spec.rb +37 -0
- data/spec/database/schema_generator_spec.rb +199 -0
- data/spec/db_connections.yml.dist +4 -0
- data/spec/query_spec.rb +495 -0
- data/spec/schema/column_spec.rb +213 -0
- data/spec/schema/dimension_builder_spec.rb +32 -0
- data/spec/schema/dimension_reference_spec.rb +90 -0
- data/spec/schema/dimension_spec.rb +111 -0
- data/spec/schema/fact_spec.rb +83 -0
- data/spec/schema/measure_spec.rb +27 -0
- data/spec/schema/named_element_collection_spec.rb +67 -0
- data/spec/schema/pivoted_column_spec.rb +17 -0
- data/spec/schema/query_column_spec.rb +120 -0
- data/spec/spec_helper.rb +20 -0
- data/spec/star_schema_spec.rb +219 -0
- data/spec/support/matchers/be_one_of.rb +11 -0
- data/spec/support/matchers/column_matchers.rb +11 -0
- data/spec/support/shared_examples/column.rb +13 -0
- data/spec/support/shared_examples/schema_table.rb +17 -0
- data/spec/support/shared_examples/schema_visitor.rb +25 -0
- data/tasks/stats.rake +108 -0
- metadata +300 -0
@@ -0,0 +1,221 @@
|
|
1
|
+
require 'chicago/schema/named_element'
|
2
|
+
|
3
|
+
module Chicago
  module Schema
    # A column in a dimension or fact record.
    #
    # The column definition is used to generate the options
    # to create the column in the database schema, but also
    # to provide an abstract definition of the column for views
    # and other Data Warehouse code.
    #
    # You shouldn't need to create a Column manually - they
    # are generally defined using the schema definition DSL.
    #
    # @api public
    class Column
      include Schema::NamedElement

      # Creates a new column definition.
      #
      # name:: the name of the column.
      # column_type:: the abstract type of the column. For example, :string.
      #
      # Options:
      #
      # min::   the minimum length/number of this column. Defaults to 0
      #         for :money columns, nil otherwise.
      # max::   the maximum length/number of this column.
      # range:: any object with a min & max method - overrides min/max (above).
      # null::  whether this column can be null. False by default, except
      #         for :date, :timestamp and :datetime columns.
      # elements:: the allowed values this column can take.
      # default::  the default value for this column.
      # size::     an explicit size for the column; takes precedence over
      #            any size derived from max or the column type.
      # countable:: whether this column can be counted. If a String is
      #             given, it is also stored as the countable label.
      # descriptive:: whether this column is purely descriptive and
      #               won't be used for grouping/filtering.
      # internal:: the column is for internal use only, and shouldn't
      #            be displayed/used directly in a user context
      # optional:: the column isn't expected to be populated. Falls back
      #            to the value of null when not given.
      #
      # @api private
      def initialize(name, column_type, opts={})
        @opts = normalize_opts(column_type, opts)

        super name, opts

        @column_type = column_type
        if @opts[:countable].kind_of?(String)
          @countable_label = @opts[:countable]
        end
        @countable   = !! @opts[:countable]
        @min         = @opts[:min]
        @max         = @opts[:max]
        @null        = @opts[:null]
        @elements    = @opts[:elements]
        @default     = @opts[:default]
        @descriptive = !! @opts[:descriptive]
        @internal    = !! @opts[:internal]
        @optional    = !! (@opts.has_key?(:optional) ? @opts[:optional] : @opts[:null])
      end

      # Returns the type of this column. This is an abstract type,
      # not a database type (for example :string, not :varchar).
      attr_reader :column_type

      # Returns the minimum value of this column, or nil.
      attr_reader :min

      # Returns the maximum value of this column, or nil.
      attr_reader :max

      # Returns an Array of allowed elements, or nil.
      attr_reader :elements

      # Returns the explicit default as set in the database, or nil.
      attr_reader :default

      # Returns the calculated default value.
      #
      # This may be different from the explicit default - for example
      # a boolean not-null column that has no explicit default will
      # have a default of nil, and a default value of false.
      #
      # The distinction is important, otherwise values can end up
      # missing in junk dimensions due to differences between what
      # ruby and the database considers unique.
      def default_value
        if @default || null?
          @default
        elsif @column_type == :boolean
          false
        elsif numeric?
          0
        elsif textual?
          ''
        else
          nil
        end
      end

      # Returns the label given via the countable option when that
      # option was a String, or nil otherwise.
      attr_reader :countable_label

      alias :key_name :name

      # Returns true if this column can be counted.
      def countable?
        @countable
      end

      # Returns true if this column should be indexed
      def indexed?
        ! descriptive?
      end

      # Returns true if this column is optional.
      #
      # Will be defaulted from whether the column allows null values,
      # can be overridden (for dates etc).
      def optional?
        @optional
      end

      # Returns true if this column should be ignored in user-facing
      # parts of an application
      def internal?
        @internal
      end

      # Returns true if null values are allowed.
      def null?
        @null
      end

      # Returns true if this column is just informational, and is not
      # intended to be used as a filter.
      def descriptive?
        @descriptive
      end

      # Returns true if both definition's attributes are equal.
      #
      # Columns are equal when other is a kind of this column's class
      # and the name, column type and normalized options all match.
      def ==(other)
        other.kind_of?(self.class) &&
          name == other.name &&
          column_type == other.column_type &&
          @opts == other.instance_variable_get(:@opts)
      end

      # Returns true if this column stores a numeric value.
      def numeric?
        # A defined? guard is used rather than ||= so that a false
        # result is cached as well.
        return @numeric if defined? @numeric
        @numeric = [:integer, :money, :percent, :decimal, :float].include?(column_type)
      end

      # Returns true if the column stores a textual value.
      def textual?
        return @textual if defined? @textual
        @textual = [:string, :text].include?(column_type)
      end

      def hash #:nodoc:
        name.hash
      end

      # Returns a hash of column options.
      #
      # :name, :column_type and :null are always present. :default is
      # included when an explicit default was given (including false)
      # or the column is a :timestamp; :elements and :size are
      # included when set; :unsigned is included for numeric columns.
      def to_hash
        db_schema = {
          :name => name,
          :column_type => column_type,
          :null => null?
        }
        # Test for nil rather than truthiness, so that an explicit
        # default of false is not silently dropped.
        db_schema[:default]  = default if !default.nil? || column_type == :timestamp
        db_schema[:elements] = elements if elements
        db_schema[:size]     = size if size
        db_schema[:unsigned] = !! unsigned? if numeric?
        db_schema
      end

      # Returns the name of this column, qualified by table.
      def qualify_by(table)
        name.qualify(table)
      end

      # Columns accept Visitors
      def visit(visitor)
        visitor.visit_column(self)
      end

      private

      # Returns whether the column should be unsigned.
      #
      # With a minimum set, the column is unsigned when the minimum is
      # not negative; otherwise only :percent and :money columns are
      # unsigned.
      def unsigned?
        return @unsigned if defined? @unsigned
        default_unsigned = column_type == :percent || column_type == :money
        @unsigned = min ? min >= 0 : default_unsigned
      end

      # Returns the size of the column, or nil if there is none.
      #
      # An explicit :size option wins, then max for :string columns,
      # then fixed [precision, scale] pairs for :money and :percent.
      def size
        # A defined? guard is used rather than ||= so that a nil size
        # is cached as well.
        return @size if defined? @size
        @size = if @opts[:size]
                  @opts[:size]
                elsif max && column_type == :string
                  max
                elsif column_type == :money
                  [12,2]
                elsif column_type == :percent
                  [6,3]
                end
      end

      # Returns a copy of opts with the type-specific defaults for
      # :null and :min applied, and any :range option expanded into
      # :min and :max. The hash passed in is not modified.
      def normalize_opts(type, opts)
        opts = {:null => default_null(type), :min => default_min(type)}.merge(opts)
        if opts[:range]
          opts[:min] = opts[:range].min
          opts[:max] = opts[:range].max
          opts.delete(:range)
        end
        opts
      end

      # Date and time columns allow null values unless told otherwise.
      def default_null(type)
        [:date, :timestamp, :datetime].include?(type)
      end

      # Money columns have a minimum of 0 unless told otherwise.
      def default_min(type)
        0 if type == :money
      end
    end
  end
end
|
@@ -0,0 +1,127 @@
|
|
1
|
+
require 'chicago/schema/query_column'
|
2
|
+
|
3
|
+
module Chicago
  module Schema
    # Parses AST column representations, returning an Array of
    # QueryColumns.
    #
    # Columns can be simple dotted references, like
    #
    #     "sales.product.name"
    #
    # calculations like:
    #
    #     {:column => "sales.total", :op => "sum"}
    #
    # or pivoted calculations like:
    #
    #     {:column => "sales.total",
    #      :op => "sum",
    #      :pivot => "sales.date.year"}
    #
    class ColumnParser
      # Creates a new ColumnParser for a schema.
      #
      # The schema is asked for tables via its fact and dimension
      # methods.
      def initialize(schema)
        @schema = schema
      end

      # Parses a column element. An element may be a string reference
      # like "foo.bar", or more complicated like {:column =>
      # "foo.bar", :op => "sum"}
      #
      # Note that Hash elements have their keys symbolized in place.
      #
      # @return [Array<QueryColumn>] an array of columns. In most cases
      #   this will be a 1-element array, unless the column is
      #   pivoted.
      def parse(elem)
        [_parse(elem)].flatten
      end

      protected

      # Returns an Array of values, given a column to pivot with.
      #
      # Boolean columns pivot on true and false, columns with a
      # defined set of elements pivot on those elements, and integer
      # columns with a small enough range pivot on each value in the
      # range.
      #
      # May be overridden by subclasses.
      #
      # @raise UnimplementedError if a column with unknown or too many
      #   elements is used as a pivot column. In future this
      #   restriction may be lifted.
      def pivotable_elements(pivot_col)
        if pivot_col.column_type == :boolean
          [true, false]
        elsif pivot_col.elements
          pivot_col.elements
        elsif has_pivotable_integer_range?(pivot_col)
          (pivot_col.min..pivot_col.max).to_a
        else
          raise UnimplementedError.new("General pivoting not yet supported")
        end
      end

      # Returns true if an Integer column can be used as pivot column.
      #
      # Default is to allow columns with a range 500 wide or less to
      # be used as pivot columns.
      #
      # May be overridden by subclasses
      #
      # @return Boolean true if this column can be pivoted.
      def has_pivotable_integer_range?(pivot_col)
        pivot_col.column_type == :integer &&
          pivot_col.max &&
          pivot_col.min &&
          (pivot_col.max - pivot_col.min <= 500)
      end

      private

      # Dispatches on the shape of the element: Hashes are complex
      # columns, anything else is treated as a dotted reference.
      def _parse(elem)
        elem.kind_of?(Hash) ? complex_column(elem) : simple_column(elem)
      end

      # Parses a Hash element, which is either a pivoted or a plain
      # calculated column depending on the presence of :pivot.
      def complex_column(elem)
        elem.symbolize_keys!
        elem[:pivot] ? pivoted_column(elem) : calculated_column(elem)
      end

      # Returns an Array of calculated columns, one per pivotable
      # value of the :pivot column.
      def pivoted_column(elem)
        source_col = _parse(elem[:column])
        pivoted_by = _parse(elem[:pivot])
        # Convert the operation once, rather than once per pivoted column.
        op = elem[:op].to_sym
        # Averages and counts use nil as the unit passed to pivot;
        # every other operation uses 0.
        unit = [:avg, :count].include?(op) ? nil : 0
        source_col.pivot(pivoted_by, pivotable_elements(pivoted_by), unit).map do |c|
          c.calculate(op)
        end
      end

      # Builds a QueryColumn from a dotted reference, labelled with
      # the reference itself.
      def simple_column(elem)
        table, col = parse_parts(elem)
        QueryColumn.column(table, col, elem.to_sym)
      end

      # Parses the :column of a Hash element, applying the :op
      # calculation if one was given.
      def calculated_column(elem)
        col = _parse(elem[:column])
        elem[:op] ? col.calculate(elem[:op].to_sym) : col
      end

      # Resolves a dotted reference to a [table, column] pair.
      def parse_parts(str)
        table, parts = parse_table(str)
        col = table[parts.shift]
        # To cope with bare dimension references.
        col = table.original_key if col.nil?

        # A reference through a dimension: the dimension becomes the
        # table, and the column is looked up on it (or is the
        # dimension itself, if nothing follows it).
        if col.kind_of?(Chicago::Schema::Dimension)
          table = col
          col = parts.empty? ? table : table[parts.first]
        end

        [table, col]
      end

      # Splits a dotted reference into its root table, looked up as a
      # fact first and then as a dimension, and the remaining parts.
      def parse_table(str)
        parts = str.split('.').map(&:to_sym)
        root  = parts.shift
        table = @schema.fact(root) || @schema.dimension(root)
        [table, parts]
      end
    end
  end
end
|
@@ -0,0 +1,129 @@
|
|
1
|
+
require 'chicago/schema/table'
|
2
|
+
|
3
|
+
module Chicago
  module Schema
    # The conventional format for dimension table names
    DIMENSION_TABLE_FORMAT = "dimension_%s".freeze

    # The conventional format for key table names.
    KEY_TABLE_FORMAT = "keys_%s".freeze

    # A dimension in the star schema.
    #
    # Dimensions contain denormalized values from various source
    # systems, and are used to group and filter the fact tables. They
    # may also be queried themselves.
    #
    # You shouldn't need to initialize a Dimension yourself - they
    # should be created via StarSchema#define_dimension.
    #
    # @api public
    class Dimension < Table
      # Returns an array of Columns defined on this dimension.
      #
      # @see Chicago::Schema::Column.
      attr_reader :columns

      # @deprecated Use columns instead.
      alias :column_definitions :columns

      # Returns all the human-friendly identifying columns for this
      # dimension.
      #
      # There is no expectation that identifying values will be unique,
      # but they are intended to identify a single record in a user
      # friendly way.
      attr_reader :identifiers

      # The table used to generate/store dimension keys.
      attr_reader :key_table_name

      # Creates a new Dimension, named +name+.
      #
      # @param name the name of the dimension
      # @option opts [Array] columns
      # @option opts [Array] identifiers
      # @option opts [Array] null_records an array of attribute
      #   hashes, used to create null record rows in the database.
      #   Hashes must have an :id key.
      # @option opts [Array<Symbol>] natural_key an array of symbols,
      #   representing a uniqueness constraint on the dimension.
      # @option opts description a long text description about the dimension.
      # @option opts predetermined_values whether the set of values
      #   for this dimension is predetermined.
      # @raise [Chicago::UnsafeNullRecordError] if any null record
      #   lacks an :id.
      def initialize(name, opts={})
        super
        @columns      = opts[:columns] || []
        @identifiers  = opts[:identifiers] || []
        @null_records = opts[:null_records] || []
        # The key table name is derived from the dimension table name.
        @table_name     = sprintf(DIMENSION_TABLE_FORMAT, name).to_sym
        @key_table_name = sprintf(KEY_TABLE_FORMAT, @table_name).to_sym
        @predetermined_values = opts[:predetermined_values] ? true : false
        check_null_records
      end

      # Creates null records in a Database.
      #
      # This will overwrite any records that share the id with the
      # null record, so be careful.
      #
      # Optionally provide an overridden table name, if you need to
      # create null records for a temporary version of the table.
      #
      # If the key table exists, the ids of the null records are
      # written to it as well.
      def create_null_records(db, overridden_table_name=nil)
        target_table = overridden_table_name || table_name
        return if @null_records.empty?

        db[target_table].insert_replace.insert_multiple(@null_records)
        return unless db.table_exists?(key_table_name)

        key_rows = @null_records.map do |record|
          {:dimension_id => record[:id]}
        end
        db[key_table_name].insert_replace.insert_multiple(key_rows)
      end

      # Returns the main identifier for this record.
      def main_identifier
        @identifiers.first
      end

      # Returns true if this dimension can be identified as a concrete
      # entity, with an original_id from a source system.
      #
      # @todo change to be consistent with identifiers
      def identifiable?
        original_key ? true : false
      end

      # Returns true if the set of values for this dimension is
      # predetermined.
      #
      # Examples of this may be date dimensions, currency dimensions
      # etc.
      def has_predetermined_values?
        @predetermined_values
      end

      # Returns the column that represents the id in the original
      # source for the dimension.
      #
      # Currently this column *must* be called +original_id+
      #
      # @todo make configurable.
      def original_key
        @original_key ||= @columns.detect do |column|
          column.name == :original_id
        end
      end

      # Dimensions accept Visitors
      def visit(visitor)
        visitor.visit_dimension(self)
      end

      private

      # Raises unless every null record has a truthy :id.
      def check_null_records
        return if @null_records.all? { |record| record[:id] }

        raise UnsafeNullRecordError.new("Null record defined without id field for dimension #{name}")
      end
    end
  end
end
|