chicagowarehouse 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. data/.document +5 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +18 -0
  4. data/LICENSE +20 -0
  5. data/README +11 -0
  6. data/Rakefile +50 -0
  7. data/chicagowarehouse.gemspec +134 -0
  8. data/lib/chicago.rb +32 -0
  9. data/lib/chicago/core_ext/hash.rb +18 -0
  10. data/lib/chicago/core_ext/sequel/dataset.rb +7 -0
  11. data/lib/chicago/core_ext/sequel/sql.rb +62 -0
  12. data/lib/chicago/data/month.rb +98 -0
  13. data/lib/chicago/database/constants.rb +18 -0
  14. data/lib/chicago/database/dataset_builder.rb +75 -0
  15. data/lib/chicago/database/filter.rb +109 -0
  16. data/lib/chicago/database/migration_file_writer.rb +34 -0
  17. data/lib/chicago/database/schema_generator.rb +117 -0
  18. data/lib/chicago/database/type_converters.rb +107 -0
  19. data/lib/chicago/database/value_parser.rb +23 -0
  20. data/lib/chicago/errors.rb +23 -0
  21. data/lib/chicago/query.rb +109 -0
  22. data/lib/chicago/rake_tasks.rb +50 -0
  23. data/lib/chicago/schema/builders/column_builder.rb +21 -0
  24. data/lib/chicago/schema/builders/dimension_builder.rb +69 -0
  25. data/lib/chicago/schema/builders/fact_builder.rb +74 -0
  26. data/lib/chicago/schema/builders/shrunken_dimension_builder.rb +54 -0
  27. data/lib/chicago/schema/builders/table_builder.rb +33 -0
  28. data/lib/chicago/schema/column.rb +221 -0
  29. data/lib/chicago/schema/column_parser.rb +127 -0
  30. data/lib/chicago/schema/dimension.rb +129 -0
  31. data/lib/chicago/schema/dimension_reference.rb +47 -0
  32. data/lib/chicago/schema/fact.rb +70 -0
  33. data/lib/chicago/schema/measure.rb +35 -0
  34. data/lib/chicago/schema/named_element.rb +16 -0
  35. data/lib/chicago/schema/named_element_collection.rb +64 -0
  36. data/lib/chicago/schema/query_column.rb +199 -0
  37. data/lib/chicago/schema/table.rb +41 -0
  38. data/lib/chicago/star_schema.rb +127 -0
  39. data/spec/core_ext/sequel_extensions_spec.rb +29 -0
  40. data/spec/data/month_spec.rb +67 -0
  41. data/spec/database/db_type_converter_spec.rb +125 -0
  42. data/spec/database/migration_file_writer_spec.rb +37 -0
  43. data/spec/database/schema_generator_spec.rb +199 -0
  44. data/spec/db_connections.yml.dist +4 -0
  45. data/spec/query_spec.rb +495 -0
  46. data/spec/schema/column_spec.rb +213 -0
  47. data/spec/schema/dimension_builder_spec.rb +32 -0
  48. data/spec/schema/dimension_reference_spec.rb +90 -0
  49. data/spec/schema/dimension_spec.rb +111 -0
  50. data/spec/schema/fact_spec.rb +83 -0
  51. data/spec/schema/measure_spec.rb +27 -0
  52. data/spec/schema/named_element_collection_spec.rb +67 -0
  53. data/spec/schema/pivoted_column_spec.rb +17 -0
  54. data/spec/schema/query_column_spec.rb +120 -0
  55. data/spec/spec_helper.rb +20 -0
  56. data/spec/star_schema_spec.rb +219 -0
  57. data/spec/support/matchers/be_one_of.rb +11 -0
  58. data/spec/support/matchers/column_matchers.rb +11 -0
  59. data/spec/support/shared_examples/column.rb +13 -0
  60. data/spec/support/shared_examples/schema_table.rb +17 -0
  61. data/spec/support/shared_examples/schema_visitor.rb +25 -0
  62. data/tasks/stats.rake +108 -0
  63. metadata +300 -0
@@ -0,0 +1,221 @@
1
+ require 'chicago/schema/named_element'
2
+
3
+ module Chicago
4
+ module Schema
5
+ # A column in a dimension or fact record.
6
+ #
7
+ # The column definition is used to generate the options
8
+ # to create the column in the database schema, but also
9
+ # to provide an abstract definition of the column for views
10
+ # and other Data Warehouse code.
11
+ #
12
+ # You shouldn't need to create a Column manually - they
13
+ # are generally defined using the schema definition DSL.
14
+ #
15
+ # @api public
16
class Column
  include Schema::NamedElement

  # Creates a new column definition.
  #
  # name:: the name of the column.
  # column_type:: the abstract type of the column. For example, :string.
  #
  # Options:
  #
  # min:: the minimum length/number of this column. Defaults to 0
  #       for :money columns.
  # max:: the maximum length/number of this column.
  # range:: any object with a min & max method - overrides min/max (above).
  # null:: whether this column can be null. False by default, except
  #        for :date, :timestamp and :datetime columns, which allow
  #        nulls unless told otherwise.
  # elements:: the allowed values this column can take.
  # default:: the default value for this column.
  # descriptive:: whether this column is purely descriptive and
  #               won't be used for grouping/filtering.
  # internal:: the column is for internal use only, and shouldn't
  #            be displayed/used directly in a user context
  # optional:: the column isn't expected to be populated. Defaults
  #            to the value of the null option.
  # countable:: whether this column can be counted. A String value
  #             is also kept as the countable label.
  # size:: an explicit size, overriding the size otherwise derived
  #        from the column type and max.
  #
  # @api private
  def initialize(name, column_type, opts={})
    @opts = normalize_opts(column_type, opts)

    # NamedElement is handed the options exactly as the caller
    # supplied them, not the normalized copy.
    super name, opts

    @column_type = column_type
    if @opts[:countable].kind_of?(String)
      @countable_label = @opts[:countable]
    end
    @countable   = !! @opts[:countable]
    @min         = @opts[:min]
    @max         = @opts[:max]
    @null        = @opts[:null]
    @elements    = @opts[:elements]
    @default     = @opts[:default]
    @descriptive = !! @opts[:descriptive]
    @internal    = !! @opts[:internal]
    @optional    = !! (@opts.has_key?(:optional) ? @opts[:optional] : @opts[:null])
  end

  # Returns the type of this column. This is an abstract type,
  # not a database type (for example :string, not :varchar).
  attr_reader :column_type

  # Returns the minimum value of this column, or nil.
  attr_reader :min

  # Returns the maximum value of this column, or nil.
  attr_reader :max

  # Returns an Array of allowed elements, or nil.
  attr_reader :elements

  # Returns the explicit default as set in the database, or nil.
  attr_reader :default

  # Returns the calculated default value.
  #
  # This may be different from the explicit default - for example
  # a boolean not-null column that has no explicit default will
  # have a default of nil, and a default value of false.
  #
  # The distinction is important, otherwise values can end up
  # missing in junk dimensions due to differences between what
  # ruby and the database considers unique.
  def default_value
    if @default || null?
      @default
    elsif @column_type == :boolean
      false
    elsif numeric?
      0
    elsif textual?
      ''
    else
      nil
    end
  end

  # Returns the String passed as the :countable option, or nil if
  # the option was not a String.
  attr_reader :countable_label

  # For a plain column the key name is simply the column name.
  alias :key_name :name

  # Returns true if this column can be counted.
  def countable?
    @countable
  end

  # Returns true if this column should be indexed
  def indexed?
    ! descriptive?
  end

  # Returns true if this column is optional.
  #
  # Will be defaulted from whether the column allows null values,
  # can be overridden (for dates etc).
  def optional?
    @optional
  end

  # Returns true if this column should be ignored in user-facing
  # parts of an application
  def internal?
    @internal
  end

  # Returns true if null values are allowed.
  def null?
    @null
  end

  # Returns true if this column is just informational, and is not
  # intended to be used as a filter.
  def descriptive?
    @descriptive
  end

  # Returns true if both definition's attributes are equal, i.e.
  # +other+ is of this column's class (or a subclass) and has the
  # same name, column type and normalized options.
  def ==(other)
    other.kind_of?(self.class) &&
      name == other.name &&
      column_type == other.column_type &&
      @opts == other.instance_variable_get(:@opts)
  end

  # Returns true if this column stores a numeric value.
  def numeric?
    # Guard with defined? rather than ||= so that a false result
    # is cached too.
    return @numeric if defined? @numeric
    @numeric = [:integer, :money, :percent, :decimal, :float].include?(column_type)
  end

  # Returns true if the column stores a textual value.
  def textual?
    # Guard with defined? rather than ||= so that a false result
    # is cached too.
    return @textual if defined? @textual
    @textual = [:string, :text].include?(column_type)
  end

  def hash #:nodoc:
    name.hash
  end

  # Returns a hash of column options.
  #
  # Always contains :name, :column_type and :null. Adds :default
  # when an explicit default was given (including false) or the
  # column is a :timestamp, :elements and :size when present, and
  # :unsigned for numeric columns.
  def to_hash
    db_schema = {
      :name => name,
      :column_type => column_type,
      :null => null?
    }
    # Test for nil rather than truthiness, so an explicit default
    # of false is not silently dropped.
    db_schema[:default]  = default if !default.nil? || column_type == :timestamp
    db_schema[:elements] = elements if elements
    db_schema[:size]     = size if size
    db_schema[:unsigned] = !! unsigned? if numeric?
    db_schema
  end

  # Returns the result of calling +qualify+ on this column's name
  # with +table+.
  def qualify_by(table)
    name.qualify(table)
  end

  # Columns accept Visitors
  def visit(visitor)
    visitor.visit_column(self)
  end

  private

  # True when the column cannot hold negative values: decided by
  # min when one is set, otherwise true only for :percent and
  # :money columns.
  def unsigned?
    return @unsigned if defined? @unsigned
    default_unsigned = column_type == :percent || column_type == :money
    @unsigned = min ? min >= 0 : default_unsigned
  end

  # The size of the column: the explicit :size option if given,
  # max for :string columns, a fixed [precision, scale] pair for
  # :money and :percent columns, otherwise nil.
  def size
    # Guard with defined? rather than ||= so that a nil result is
    # cached too.
    return @size if defined? @size
    @size = if @opts[:size]
              @opts[:size]
            elsif max && column_type == :string
              max
            elsif column_type == :money
              [12,2]
            elsif column_type == :percent
              [6,3]
            end
  end

  # Returns a new options hash with type-based defaults for :null
  # and :min applied, and any :range expanded into :min and :max.
  # The hash passed in is not modified.
  def normalize_opts(type, opts)
    opts = {:null => default_null(type), :min => default_min(type)}.merge(opts)
    if opts[:range]
      opts[:min] = opts[:range].min
      opts[:max] = opts[:range].max
      opts.delete(:range)
    end
    opts
  end

  # Date-like columns allow nulls unless told otherwise.
  def default_null(type)
    [:date, :timestamp, :datetime].include?(type)
  end

  # Money columns have a minimum of 0 unless told otherwise.
  def default_min(type)
    0 if type == :money
  end
end
220
+ end
221
+ end
@@ -0,0 +1,127 @@
1
+ require 'chicago/schema/query_column'
2
+
3
+ module Chicago
4
+ module Schema
5
+ # Parses AST column representations, returning an Array of
6
+ # QueryColumns.
7
+ #
8
+ # Columns can be simple dotted references, like
9
+ #
10
+ # "sales.product.name"
11
+ #
12
+ # calculations like:
13
+ #
14
+ # {:column => "sales.total", :op => "sum"}
15
+ #
16
+ # or pivoted calculations like:
17
+ #
18
+ # {:column => "sales.total",
19
+ # :op => "sum",
20
+ # :pivot => "sales.date.year"}
21
+ #
22
class ColumnParser
  # Builds a parser that resolves column references against
  # +schema+.
  def initialize(schema)
    @schema = schema
  end

  # Parses a single column element, which is either a dotted
  # String reference such as "foo.bar", or a Hash such as
  # {:column => "foo.bar", :op => "sum"}.
  #
  # @return [Array] the parsed columns. Usually a 1-element
  #   array; pivoted columns expand to one column per pivot
  #   value.
  def parse(elem)
    parsed = _parse(elem)
    [parsed].flatten
  end

  protected

  # Returns the Array of values a pivot column expands to.
  #
  # Boolean columns pivot on true and false, columns with
  # declared elements pivot on those elements, and integer
  # columns with a small enough range pivot on every integer in
  # that range.
  #
  # May be overridden by subclasses.
  #
  # @raise UnimplementedError if none of the above applies. In
  #   future this restriction may be lifted.
  def pivotable_elements(pivot_col)
    return [true, false] if pivot_col.column_type == :boolean
    return pivot_col.elements if pivot_col.elements
    if has_pivotable_integer_range?(pivot_col)
      return (pivot_col.min..pivot_col.max).to_a
    end

    raise UnimplementedError.new("General pivoting not yet support")
  end

  # Returns a truthy value if an Integer column can be used as a
  # pivot column.
  #
  # By default the column must have both a min and a max, and
  # they must be no more than 500 apart.
  #
  # May be overridden by subclasses
  def has_pivotable_integer_range?(pivot_col)
    return false unless pivot_col.column_type == :integer

    upper = pivot_col.max
    lower = upper && pivot_col.min
    lower && (upper - lower <= 500)
  end

  private

  # Dispatches on the element's shape: Hashes are complex
  # columns, anything else is treated as a dotted reference.
  def _parse(elem)
    if elem.kind_of?(Hash)
      complex_column(elem)
    else
      simple_column(elem)
    end
  end

  # Handles Hash elements. Note the Hash's keys are symbolized in
  # place.
  def complex_column(elem)
    elem.symbolize_keys!
    return pivoted_column(elem) if elem[:pivot]

    calculated_column(elem)
  end

  # Expands a pivoted calculation into one calculated column per
  # pivot value. The unit passed to pivot is nil for avg and
  # count operations and 0 for all others.
  def pivoted_column(elem)
    source = _parse(elem[:column])
    pivot = _parse(elem[:pivot])
    operation = elem[:op].to_sym
    unit = [:avg, :count].include?(operation) ? nil : 0

    expanded = source.pivot(pivot, pivotable_elements(pivot), unit)
    expanded.map { |column| column.calculate(operation) }
  end

  # Builds a column from a dotted reference, labelled with the
  # reference itself.
  def simple_column(elem)
    owner, column = parse_parts(elem)
    QueryColumn.column(owner, column, elem.to_sym)
  end

  # Parses the :column entry and applies the :op calculation to
  # it, if there is one.
  def calculated_column(elem)
    column = _parse(elem[:column])
    operation = elem[:op]
    operation ? column.calculate(operation.to_sym) : column
  end

  # Resolves a dotted reference to a [table, column] pair.
  def parse_parts(str)
    table, parts = parse_table(str)
    column = table[parts.shift]
    # To cope with bare dimension references.
    column = table.original_key if column.nil?

    return [table, column] unless column.kind_of?(Chicago::Schema::Dimension)

    # The reference goes through a dimension: that dimension
    # becomes the table, and the column is looked up on it (or is
    # the dimension itself when nothing follows it).
    dimension = column
    [dimension, parts.empty? ? dimension : dimension[parts.first]]
  end

  # Splits a dotted reference into its root table - a fact if
  # one has that name, otherwise a dimension - and the remaining
  # parts as Symbols.
  def parse_table(str)
    root, *rest = str.split('.').map(&:to_sym)
    [@schema.fact(root) || @schema.dimension(root), rest]
  end
end
126
+ end
127
+ end
@@ -0,0 +1,129 @@
1
+ require 'chicago/schema/table'
2
+
3
+ module Chicago
4
+ module Schema
5
# Format string for conventional dimension table names.
DIMENSION_TABLE_FORMAT = 'dimension_%s'.freeze

# Format string for conventional key table names.
KEY_TABLE_FORMAT = 'keys_%s'.freeze
10
+
11
+ # A dimension in the star schema.
12
+ #
13
+ # Dimensions contain denormalized values from various source
14
+ # systems, and are used to group and filter the fact tables. They
15
+ # may also be queried themselves.
16
+ #
17
+ # You shouldn't need to initialize a Dimension yourself - they
18
+ # should be created via StarSchema#define_dimension.
19
+ #
20
+ # @api public
21
class Dimension < Table
  # Returns an array of Columns defined on this dimension.
  #
  # @see Chicago::Schema::Column
  attr_reader :columns

  # @deprecated Use columns instead.
  alias :column_definitions :columns

  # Returns all the human-friendly identifying columns for this
  # dimension.
  #
  # Identifying values are not expected to be unique; they exist
  # to point a user at a single record in a friendly way.
  attr_reader :identifiers

  # The table used to generate/store dimension keys.
  attr_reader :key_table_name

  # Creates a new Dimension, named +name+.
  #
  # @param name the name of the dimension
  # @option opts [Array] columns
  # @option opts [Array] identifiers
  # @option opts [Array] null_records an array of attribute
  #   hashes, used to create null record rows in the database.
  #   Hashes must have an :id key.
  # @option opts [Array<Symbol>] natural_key an array of symbols,
  #   representing a uniqueness constraint on the dimension.
  # @option opts predetermined_values whether the set of values
  #   for this dimension is predetermined.
  # @option opts description a long text description about the dimension.
  # @raise [Chicago::UnsafeNullRecordError] if a null record has
  #   no :id.
  def initialize(name, opts={})
    super

    @columns      = opts[:columns] || []
    @identifiers  = opts[:identifiers] || []
    @null_records = opts[:null_records] || []

    # The key table is named after the dimension's table, not
    # after the bare dimension name.
    @table_name     = sprintf(DIMENSION_TABLE_FORMAT, name).to_sym
    @key_table_name = sprintf(KEY_TABLE_FORMAT, @table_name).to_sym

    @predetermined_values = opts[:predetermined_values] ? true : false

    check_null_records
  end

  # Creates null records in a Database.
  #
  # Existing records sharing an id with a null record are
  # overwritten, so be careful.
  #
  # Pass +overridden_table_name+ to populate a different table,
  # for example a temporary version of the dimension table. The
  # key table, when it exists, is always populated under its
  # normal name.
  def create_null_records(db, overridden_table_name=nil)
    target = overridden_table_name || table_name
    return if @null_records.empty?

    db[target].insert_replace.insert_multiple(@null_records)
    return unless db.table_exists?(key_table_name)

    key_rows = @null_records.map { |record| {:dimension_id => record[:id]} }
    db[key_table_name].insert_replace.insert_multiple(key_rows)
  end

  # Returns the main identifier for this record: the first of
  # the identifiers.
  def main_identifier
    @identifiers.first
  end

  # Returns true if this dimension can be identified as a concrete
  # entity, with an original_id from a source system.
  #
  # @todo change to be consistent with identifiers
  def identifiable?
    original_key ? true : false
  end

  # Returns true if the set of values for this dimension is
  # predetermined.
  #
  # Examples of this may be date dimensions, currency dimensions
  # etc.
  def has_predetermined_values?
    @predetermined_values
  end

  # Returns the column that represents the id in the original
  # source for the dimension, or nil if there isn't one.
  #
  # Currently this column *must* be called +original_id+
  #
  # @todo make configurable.
  def original_key
    @original_key ||= @columns.detect do |column|
      column.name == :original_id
    end
  end

  # Dimensions accept Visitors
  def visit(visitor)
    visitor.visit_dimension(self)
  end

  private

  # Ensures every null record carries an :id.
  def check_null_records
    return if @null_records.all? { |record| record[:id] }

    raise UnsafeNullRecordError.new("Null record defined without id field for dimension #{name}")
  end
end
128
+ end
129
+ end