purview 1.0.0.alpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (83) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +33 -0
  3. data/.travis.yml +18 -0
  4. data/Gemfile +3 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +143 -0
  7. data/Rakefile +11 -0
  8. data/TODO +81 -0
  9. data/lib/purview/columns/base.rb +65 -0
  10. data/lib/purview/columns/boolean.rb +11 -0
  11. data/lib/purview/columns/created_timestamp.rb +11 -0
  12. data/lib/purview/columns/date.rb +11 -0
  13. data/lib/purview/columns/float.rb +11 -0
  14. data/lib/purview/columns/id.rb +11 -0
  15. data/lib/purview/columns/integer.rb +11 -0
  16. data/lib/purview/columns/money.rb +11 -0
  17. data/lib/purview/columns/string.rb +11 -0
  18. data/lib/purview/columns/text.rb +11 -0
  19. data/lib/purview/columns/time.rb +11 -0
  20. data/lib/purview/columns/timestamp.rb +11 -0
  21. data/lib/purview/columns/updated_timestamp.rb +11 -0
  22. data/lib/purview/columns/uuid.rb +11 -0
  23. data/lib/purview/columns.rb +14 -0
  24. data/lib/purview/connections/base.rb +55 -0
  25. data/lib/purview/connections/mysql.rb +39 -0
  26. data/lib/purview/connections/postgresql.rb +27 -0
  27. data/lib/purview/connections.rb +3 -0
  28. data/lib/purview/databases/base.rb +559 -0
  29. data/lib/purview/databases/mysql.rb +207 -0
  30. data/lib/purview/databases/postgresql.rb +210 -0
  31. data/lib/purview/databases.rb +3 -0
  32. data/lib/purview/exceptions/base.rb +5 -0
  33. data/lib/purview/exceptions/could_not_acquire_lock.rb +9 -0
  34. data/lib/purview/exceptions/lock_already_released.rb +9 -0
  35. data/lib/purview/exceptions/no_table.rb +9 -0
  36. data/lib/purview/exceptions/no_window.rb +9 -0
  37. data/lib/purview/exceptions/rows_outside_window.rb +18 -0
  38. data/lib/purview/exceptions/table.rb +13 -0
  39. data/lib/purview/exceptions.rb +7 -0
  40. data/lib/purview/loaders/base.rb +154 -0
  41. data/lib/purview/loaders/mysql.rb +81 -0
  42. data/lib/purview/loaders/postgresql.rb +81 -0
  43. data/lib/purview/loaders.rb +3 -0
  44. data/lib/purview/loggers/base.rb +99 -0
  45. data/lib/purview/loggers/console.rb +11 -0
  46. data/lib/purview/loggers.rb +2 -0
  47. data/lib/purview/mixins/helpers.rb +21 -0
  48. data/lib/purview/mixins/logger.rb +21 -0
  49. data/lib/purview/mixins.rb +2 -0
  50. data/lib/purview/parsers/base.rb +39 -0
  51. data/lib/purview/parsers/csv.rb +49 -0
  52. data/lib/purview/parsers/tsv.rb +11 -0
  53. data/lib/purview/parsers.rb +3 -0
  54. data/lib/purview/pullers/base.rb +19 -0
  55. data/lib/purview/pullers/uri.rb +66 -0
  56. data/lib/purview/pullers.rb +2 -0
  57. data/lib/purview/refinements/object.rb +5 -0
  58. data/lib/purview/refinements/time.rb +5 -0
  59. data/lib/purview/refinements.rb +2 -0
  60. data/lib/purview/structs/base.rb +10 -0
  61. data/lib/purview/structs/result.rb +7 -0
  62. data/lib/purview/structs/window.rb +7 -0
  63. data/lib/purview/structs.rb +3 -0
  64. data/lib/purview/tables/base.rb +140 -0
  65. data/lib/purview/tables/raw.rb +13 -0
  66. data/lib/purview/tables.rb +2 -0
  67. data/lib/purview/types/base.rb +9 -0
  68. data/lib/purview/types/boolean.rb +9 -0
  69. data/lib/purview/types/date.rb +9 -0
  70. data/lib/purview/types/float.rb +9 -0
  71. data/lib/purview/types/integer.rb +9 -0
  72. data/lib/purview/types/money.rb +9 -0
  73. data/lib/purview/types/string.rb +9 -0
  74. data/lib/purview/types/text.rb +9 -0
  75. data/lib/purview/types/time.rb +9 -0
  76. data/lib/purview/types/timestamp.rb +9 -0
  77. data/lib/purview/types/uuid.rb +9 -0
  78. data/lib/purview/types.rb +11 -0
  79. data/lib/purview/version.rb +3 -0
  80. data/lib/purview.rb +27 -0
  81. data/purview.gemspec +29 -0
  82. data/spec/spec_helper.rb +5 -0
  83. metadata +210 -0
@@ -0,0 +1,81 @@
1
module Purview
  module Loaders
    # MySQL flavour of the loader. Every method here only builds a SQL string;
    # `table`, `quoted`, `row_values` and `count_column_name` are defined
    # outside this class (presumably on Base -- confirm there).
    class MySQL < Base
      private

      # SELECT listing the id column of every row in the temporary table.
      def id_in_sql(temporary_table_name)
        "SELECT #{table.id_column.name} FROM #{temporary_table_name}"
      end

      # Predicate matching rows whose updated-timestamp lies inside the window
      # (BETWEEN is inclusive on both ends).
      def in_window_sql(window)
        timestamp_column = table.updated_timestamp_column.name
        lower_bound = quoted(window.min)
        upper_bound = quoted(window.max)
        "#{timestamp_column} BETWEEN #{lower_bound} AND #{upper_bound}"
      end

      # Predicate matching rows whose updated-timestamp lies outside the window.
      def not_in_window_sql(window)
        timestamp_column = table.updated_timestamp_column.name
        lower_bound = quoted(window.min)
        upper_bound = quoted(window.max)
        "#{timestamp_column} NOT BETWEEN #{lower_bound} AND #{upper_bound}"
      end

      # DELETE removing in-window rows of the target table whose id is absent
      # from the temporary table.
      def table_delete_sql(window, temporary_table_name)
        target = table.name
        window_condition = in_window_sql(window)
        id_column = table.id_column.name
        staged_ids = id_in_sql(temporary_table_name)
        "DELETE FROM #{target} WHERE #{window_condition} AND #{id_column} NOT IN (#{staged_ids})"
      end

      # INSERT copying rows from the temporary table (t1) whose id does not yet
      # exist in the target table (t2). `window` is accepted but not used.
      def table_insert_sql(window, temporary_table_name)
        target = table.name
        column_list = table.column_names.join(', ')
        id_column = table.id_column.name
        existing_row = "SELECT 1 FROM #{target} t2 WHERE t1.#{id_column} = t2.#{id_column}"
        "INSERT INTO #{target} (#{column_list}) SELECT #{column_list} " \
          "FROM #{temporary_table_name} t1 WHERE NOT EXISTS (#{existing_row})"
      end

      # Multi-table UPDATE: joins the target (t1) to the temporary table (t2)
      # on the id column and overwrites every column of t1 with t2's value.
      # `window` is accepted but not used.
      def table_update_sql(window, temporary_table_name)
        id_column = table.id_column.name
        assignments = table.column_names.map do |column_name|
          "t1.#{column_name} = t2.#{column_name}"
        end
        "UPDATE #{table.name} t1 JOIN #{temporary_table_name} t2 " \
          "ON t1.#{id_column} = t2.#{id_column} SET #{assignments.join(', ')}"
      end

      # Single multi-row INSERT of `rows` into the temporary table; each row is
      # rendered by `row_values` and wrapped in parentheses.
      def temporary_table_insert_sql(temporary_table_name, rows)
        column_list = table.column_names.join(', ')
        value_groups = rows.map { |row| "(#{row_values(row)})" }
        "INSERT INTO #{temporary_table_name} (#{column_list}) VALUES #{value_groups.join(', ')}"
      end

      # Inherited temporary-table options with :create_indices forced to false.
      def temporary_table_opts
        without_indices = { :create_indices => false }
        super.merge(without_indices)
      end

      # COUNT of temporary-table rows that fall outside the window, aliased to
      # `count_column_name`. `rows` is accepted but not used.
      def temporary_table_verify_sql(temporary_table_name, rows, window)
        outside_window = not_in_window_sql(window)
        "SELECT COUNT(1) #{count_column_name} FROM #{temporary_table_name} WHERE #{outside_window}"
      end
    end
  end
end
@@ -0,0 +1,81 @@
1
module Purview
  module Loaders
    # PostgreSQL flavour of the loader. Every method here only builds a SQL
    # string; `table`, `quoted`, `row_values` and `count_column_name` are
    # defined outside this class (presumably on Base -- confirm there).
    class PostgreSQL < Base
      private

      # SELECT listing the id column of every row in the temporary table.
      def id_in_sql(temporary_table_name)
        "SELECT #{table.id_column.name} FROM #{temporary_table_name}"
      end

      # Predicate matching rows whose updated-timestamp lies inside the window
      # (BETWEEN is inclusive on both ends).
      def in_window_sql(window)
        timestamp_column = table.updated_timestamp_column.name
        lower_bound = quoted(window.min)
        upper_bound = quoted(window.max)
        "#{timestamp_column} BETWEEN #{lower_bound} AND #{upper_bound}"
      end

      # Predicate matching rows whose updated-timestamp lies outside the window.
      def not_in_window_sql(window)
        timestamp_column = table.updated_timestamp_column.name
        lower_bound = quoted(window.min)
        upper_bound = quoted(window.max)
        "#{timestamp_column} NOT BETWEEN #{lower_bound} AND #{upper_bound}"
      end

      # DELETE removing in-window rows of the target table whose id is absent
      # from the temporary table.
      def table_delete_sql(window, temporary_table_name)
        target = table.name
        window_condition = in_window_sql(window)
        id_column = table.id_column.name
        staged_ids = id_in_sql(temporary_table_name)
        "DELETE FROM #{target} WHERE #{window_condition} AND #{id_column} NOT IN (#{staged_ids})"
      end

      # INSERT copying rows from the temporary table (t1) whose id does not yet
      # exist in the target table (t2). `window` is accepted but not used.
      def table_insert_sql(window, temporary_table_name)
        target = table.name
        column_list = table.column_names.join(', ')
        id_column = table.id_column.name
        existing_row = "SELECT 1 FROM #{target} t2 WHERE t1.#{id_column} = t2.#{id_column}"
        "INSERT INTO #{target} (#{column_list}) SELECT #{column_list} " \
          "FROM #{temporary_table_name} t1 WHERE NOT EXISTS (#{existing_row})"
      end

      # UPDATE ... SET ... FROM: overwrites every column of the target (t1)
      # with the value from the temporary-table row (t2) sharing the same id.
      # `window` is accepted but not used.
      def table_update_sql(window, temporary_table_name)
        id_column = table.id_column.name
        assignments = table.column_names.map do |column_name|
          "#{column_name} = t2.#{column_name}"
        end
        "UPDATE #{table.name} t1 SET #{assignments.join(', ')} " \
          "FROM #{temporary_table_name} t2 WHERE t1.#{id_column} = t2.#{id_column}"
      end

      # Single multi-row INSERT of `rows` into the temporary table; each row is
      # rendered by `row_values` and wrapped in parentheses.
      def temporary_table_insert_sql(temporary_table_name, rows)
        column_list = table.column_names.join(', ')
        value_groups = rows.map { |row| "(#{row_values(row)})" }
        "INSERT INTO #{temporary_table_name} (#{column_list}) VALUES #{value_groups.join(', ')}"
      end

      # Inherited temporary-table options with :create_indices forced to false.
      def temporary_table_opts
        without_indices = { :create_indices => false }
        super.merge(without_indices)
      end

      # COUNT of temporary-table rows that fall outside the window, aliased to
      # `count_column_name`. `rows` is accepted but not used.
      def temporary_table_verify_sql(temporary_table_name, rows, window)
        outside_window = not_in_window_sql(window)
        "SELECT COUNT(1) #{count_column_name} FROM #{temporary_table_name} WHERE #{outside_window}"
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
+ require 'purview/loaders/base'
2
+ require 'purview/loaders/mysql'
3
+ require 'purview/loaders/postgresql'
@@ -0,0 +1,99 @@
1
module Purview
  module Loggers
    # Minimal leveled logger. Each enabled message is written with `puts` to
    # the IO-like object in opts[:stream] as:
    #
    #   <timestamp> <LEVEL> (<pid>) <message>[:\n<backtrace lines>]
    #
    # Levels are switched individually through the :debug, :error and :info
    # options (defaults: debug and error on, info off). This class supplies no
    # default stream; subclasses or callers must provide one.
    class Base
      # opts - Hash merged over `default_opts`.
      def initialize(opts={})
        @opts = default_opts.merge(opts)
      end

      # Logs at DEBUG level when enabled. args: message[, exception].
      def debug(*args)
        log(DEBUG_LEVEL, *args) if debug?
      end

      # Logs at ERROR level when enabled. args: message[, exception].
      def error(*args)
        log(ERROR_LEVEL, *args) if error?
      end

      # Logs at INFO level when enabled. args: message[, exception].
      def info(*args)
        log(INFO_LEVEL, *args) if info?
      end

      # Wraps the block in a pair of DEBUG messages and returns the block's
      # value.
      #
      # With one argument the messages are "Starting <arg>" / "Finished <arg>";
      # with two, the first is the starting message and the second the
      # finishing one. Any other count raises RuntimeError. The finishing
      # message is not logged when the block raises.
      def with_context_logging(*args)
        debug(build_starting_message(*args))
        yield.tap { debug(build_finished_message(*args)) }
      end

      private

      # NOTE: `private` does not apply to constants; these stay publicly
      # reachable, exactly as before.
      DEBUG_LEVEL = 'DEBUG'
      ERROR_LEVEL = 'ERROR'
      INFO_LEVEL = 'INFO'

      attr_reader :opts

      def build_finished_message(*args)
        case args.length
        when 1 then "Finished #{args[0]}"
        when 2 then args[-1]
        else
          # An explicit message matters: a bare `raise` would re-raise whatever
          # exception is currently being rescued by the caller ($!).
          raise "Expected 1 or 2 context arguments, got #{args.length}"
        end
      end

      # Renders one log line (plus backtrace lines when an exception is given).
      def build_message(level, *args)
        message, exception = args[0..1]
        message_template(!!exception) % {
          :exception => format_exception(exception),
          :level => level,
          :message => message,
          :process_id => Process.pid,
          :timestamp => Time.now.strftime('%Y-%m-%d %H:%M:%S.%L %z'),
        }
      end

      def build_starting_message(*args)
        case args.length
        when 1 then "Starting #{args[0]}"
        when 2 then args[0]
        else
          # See build_finished_message for why the message is explicit.
          raise "Expected 1 or 2 context arguments, got #{args.length}"
        end
      end

      def debug?
        !!opts[:debug]
      end

      def default_opts
        {
          :debug => true,
          :error => true,
          :info => false,
        }
      end

      def error?
        !!opts[:error]
      end

      # Backtrace rendered one "\tfrom <line>" per frame. An exception that was
      # never raised has a nil backtrace; that yields an empty string instead
      # of a NoMethodError.
      def format_exception(exception)
        return exception unless exception
        Array(exception.backtrace).map { |line| "\tfrom #{line}" }.join("\n")
      end

      def info?
        !!opts[:info]
      end

      def log(level, *args)
        stream.puts build_message(level, *args)
      end

      # Format string for String#%; built without mutating a string literal so
      # it keeps working under frozen string literals.
      def message_template(exception)
        template = '%{timestamp} %{level} (%{process_id}) %{message}'
        exception ? "#{template}:\n%{exception}" : template
      end

      def stream
        opts[:stream]
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
module Purview
  module Loggers
    # Logger whose default output stream is the process's STDOUT.
    class Console < Base
      private

      # Inherited defaults plus :stream => STDOUT. Options passed to the
      # constructor are merged over these, so a caller can still supply a
      # different stream.
      def default_opts
        console_defaults = { :stream => STDOUT }
        super.merge(console_defaults)
      end
    end
  end
end
@@ -0,0 +1,2 @@
1
+ require 'purview/loggers/base'
2
+ require 'purview/loggers/console'
@@ -0,0 +1,21 @@
1
module Purview
  module Mixins
    # Small value-inspection helpers meant to be mixed into other classes.
    module Helpers
      # True when the value's string form is empty or whitespace only
      # (so nil and "  " are both blank).
      def blank?(value)
        value.to_s.strip.empty?
      end

      # Returns `default` only when `value` is nil; false is kept as-is.
      def coalesce(value, default)
        return default if value.nil?
        value
      end

      # Opposite of blank?.
      def present?(value)
        !blank?(value)
      end

      # True when the value converts to the integer 0. Conversion uses
      # Kernel#Integer, so nil or non-numeric strings raise.
      def zero?(value)
        Integer(value) == 0
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module Purview
  module Mixins
    # Gives the including object a lazily built logger. The includer must
    # expose an `opts` hash; :logger_type selects the logger class and :logger
    # carries the options handed to its constructor.
    module Logger
      # The logger instance, created on first use and then reused.
      def logger
        @logger ||= begin
          logger_class = logger_type
          logger_class.new(logger_opts)
        end
      end

      # Constructor options for the logger; an empty hash when none are set.
      def logger_opts
        configured = opts[:logger]
        configured || {}
      end

      # Logger class to instantiate; Purview::Loggers::Console unless
      # opts[:logger_type] says otherwise.
      def logger_type
        configured = opts[:logger_type]
        configured || Purview::Loggers::Console
      end

      # Delegates to the logger's with_context_logging, running the caller's
      # block inside it.
      def with_context_logging(*args)
        logger.with_context_logging(*args) do
          yield
        end
      end
    end
  end
end
@@ -0,0 +1,2 @@
1
+ require 'purview/mixins/helpers'
2
+ require 'purview/mixins/logger'
@@ -0,0 +1,39 @@
1
module Purview
  module Parsers
    # Abstract parser. Subclasses implement `parse`; `validate` accepts
    # everything unless overridden. The target table comes from opts[:table].
    class Base
      include Purview::Mixins::Logger

      def initialize(opts={})
        @opts = opts
      end

      # Must be overridden; this implementation always raises.
      def parse(data)
        raise "All \"#{Base}(s)\" must override the \"parse\" method"
      end

      # Default validation: always passes.
      def validate(data)
        true
      end

      private

      attr_reader :opts

      # Turns an enumerable of (key, raw value) pairs into a hash of parsed
      # values. Keys the table has no column for are left out and reported at
      # debug level.
      def build_result(row)
        result = {}
        row.each do |key, value|
          column = table.columns_by_name[key]
          if column
            result[key] = column.parse(value)
          else
            logger.debug("Unexpected column: \"#{key}\" in data-set")
          end
        end
        result
      end

      def table
        opts[:table]
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
module Purview
  module Parsers
    # Parses delimiter-separated text whose first line is a header row. The
    # data is split into lines on `row_separator` and each line is parsed on
    # its own, so quoted fields containing the row separator are not
    # supported.
    class CSV < Base
      # Returns an array with one result hash (see `build_result`) per data
      # row; empty data or header-only data yields [].
      def parse(data)
        with_context_logging("`parse` for: #{table.name}") do
          [].tap do |results|
            headers = extract_headers(data)
            extract_rows(data) do |row|
              results << build_result(headers.zip(row))
            end
          end
        end
      end

      # Returns true when every table column appears in the header row;
      # raises RuntimeError naming the missing columns otherwise.
      def validate(data)
        with_context_logging("`validate` for: #{table.name}") do
          missing_columns = table.column_names - extract_headers(data)
          raise 'Missing one or more columns: "%s"' % missing_columns.join('", "') \
            unless missing_columns.empty?
          true
        end
      end

      private

      def column_separator
        ','
      end

      # Header names as symbols. Empty data (or a blank first line) has no
      # headers, so [] is returned instead of passing nil on to the parser.
      def extract_headers(data)
        header_row = data.split(row_separator).first
        return [] if header_row.nil?
        Array(parse_row(header_row)).map(&:to_sym)
      end

      # Yields the parsed fields of every line after the header. `drop(1)`
      # copes with empty data, where `[1..-1]` would be nil. Blank lines parse
      # to nil and are skipped rather than crashing the caller's `zip`.
      def extract_rows(data)
        data.split(row_separator).drop(1).each do |row|
          fields = parse_row(row)
          yield fields unless fields.nil?
        end
      end

      # Fields of a single line, or nil when the line is empty.
      def parse_row(row)
        ::CSV.parse(row, :col_sep => column_separator).first
      end

      # Input record separator global; "\n" unless the program changed it.
      def row_separator
        $/
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
module Purview
  module Parsers
    # Tab-separated variant of the CSV parser; only the column separator
    # differs.
    class TSV < CSV
      private

      # A single tab character.
      def column_separator
        tab = "\t"
        tab
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
+ require 'purview/parsers/base'
2
+ require 'purview/parsers/csv'
3
+ require 'purview/parsers/tsv'
@@ -0,0 +1,19 @@
1
module Purview
  module Pullers
    # Abstract puller. Subclasses implement `pull(window)`; the options hash
    # given to the constructor is available to them through `opts`.
    class Base
      include Purview::Mixins::Logger

      def initialize(opts={})
        @opts = opts
      end

      # Must be overridden; this implementation always raises.
      def pull(window)
        raise "All \"#{Base}(s)\" must override the \"pull\" method"
      end

      private

      attr_reader :opts
    end
  end
end
@@ -0,0 +1,66 @@
1
module Purview
  module Pullers
    # Pulls raw data with an HTTP(S) GET against opts[:uri], appending the
    # window bounds as `ts1`/`ts2` query parameters.
    #
    # Options read here:
    #   :uri             - URL to request
    #   :username        - with :password, enables HTTP basic auth
    #   :password
    #   :ssl_verify_mode - OpenSSL verify mode for https URLs; defaults to
    #                      OpenSSL::SSL::VERIFY_PEER
    class URI < Base
      # Performs the request for the given window and returns the response
      # body.
      def pull(window)
        request = windowed_request(window)
        with_context_logging("`pull` from: #{request.path}") do
          http.request(request).body
        end
      end

      private

      def basic_auth?
        username && password
      end

      def host
        uri.host
      end

      # A new Net::HTTP client for the target host; TLS is switched on for
      # https URLs.
      def http
        Net::HTTP.new(host, port).tap do |connection|
          if https?
            connection.use_ssl = true
            connection.verify_mode = ssl_verify_mode
          end
        end
      end

      def https?
        uri.scheme == 'https'
      end

      def password
        opts[:password]
      end

      def port
        uri.port
      end

      # Certificate verification is on by default: this puller may send
      # basic-auth credentials, and an unverified connection would expose them
      # to anyone able to intercept traffic. Pass
      # :ssl_verify_mode => OpenSSL::SSL::VERIFY_NONE to opt out explicitly
      # (for example against a self-signed test server).
      def ssl_verify_mode
        opts[:ssl_verify_mode] || OpenSSL::SSL::VERIFY_PEER
      end

      def uri
        ::URI.parse(opts[:uri])
      end

      def username
        opts[:username]
      end

      # GET request for the windowed URL, with basic auth when configured.
      def windowed_request(window)
        Net::HTTP::Get.new(windowed_request_uri(window)).tap do |request|
          if basic_auth?
            request.basic_auth(username, password)
          end
        end
      end

      # The configured URL with ts1/ts2 appended, using '&' when the URL
      # already contains a '?', otherwise '?'. Bounds are sent as
      # `window.min.to_i` / `window.max.to_i` (epoch seconds if they are Time
      # values -- confirm against callers).
      def windowed_request_uri(window)
        uri.to_s.tap do |request_uri|
          request_uri << (request_uri.include?('?') ? '&' : '?')
          request_uri << 'ts1=%s&ts2=%s' % [window.min.to_i, window.max.to_i]
        end
      end
    end
  end
end
@@ -0,0 +1,2 @@
1
+ require 'purview/pullers/base'
2
+ require 'purview/pullers/uri'
@@ -0,0 +1,5 @@
1
class Object
  # String form of the object wrapped in single quotes, for use as a SQL
  # string literal. Embedded single quotes are doubled (the standard SQL
  # escape) so a value such as O'Brien can neither terminate the literal early
  # nor inject SQL. Values without single quotes render exactly as before.
  #
  # NOTE(review): backslashes are passed through untouched; whether they need
  # escaping depends on the database's string-literal settings.
  def quoted
    "'#{to_s.gsub("'", "''")}'"
  end
end
@@ -0,0 +1,5 @@
1
class Time
  # The time formatted with '%F %T' (date, space, time of day to the second)
  # and wrapped in single quotes. Sub-second precision and the UTC offset are
  # not part of the output.
  def quoted
    formatted = strftime('%F %T')
    "'" + formatted + "'"
  end
end
@@ -0,0 +1,2 @@
1
+ require 'purview/refinements/object'
2
+ require 'purview/refinements/time'
@@ -0,0 +1,10 @@
1
module Purview
  module Structs
    # OpenStruct that raises on reads of attributes that were never set,
    # instead of quietly returning nil.
    class Base < OpenStruct
      # Raises NoMethodError (carrying the attribute name) for a zero-argument
      # call to an unknown attribute; everything else is left to OpenStruct.
      #
      # Known attributes are let through deliberately: some Ruby versions
      # define OpenStruct accessors lazily via method_missing, and rejecting
      # every zero-argument call would break reads of attributes that exist.
      def method_missing(method_name, *args, &block)
        if args.empty? && !respond_to_missing?(method_name, true)
          raise NoMethodError.new(
            "undefined attribute `#{method_name}' for #{self.class}",
            method_name
          )
        end
        super
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
module Purview
  module Structs
    # Named struct type that currently adds nothing to Base.
    # NOTE(review): presumably carries the outcome of an operation -- confirm
    # against callers.
    class Result < Base
      # Helper methods/overrides
    end
  end
end
@@ -0,0 +1,7 @@
1
module Purview
  module Structs
    # Named struct type that currently adds nothing to Base. Elsewhere in the
    # library its `min` and `max` attributes are read as the window bounds.
    class Window < Base
      # Helper methods/overrides
    end
  end
end
@@ -0,0 +1,3 @@
1
+ require 'purview/structs/base'
2
+ require 'purview/structs/result'
3
+ require 'purview/structs/window'