infobright-loader 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (94) hide show
  1. data/.gitignore +17 -0
  2. data/Gemfile +19 -0
  3. data/LICENSE-2.0.txt +202 -0
  4. data/README.md +232 -0
  5. data/Rakefile +2 -0
  6. data/bin/infobright-loader +45 -0
  7. data/control-file/template.yml +23 -0
  8. data/infobright-loader.gemspec +35 -0
  9. data/lib/infobright-loader.rb +5 -0
  10. data/lib/infobright-loader/cli/config.rb +204 -0
  11. data/lib/infobright-loader/cli/loader.rb +66 -0
  12. data/lib/infobright-loader/db.rb +90 -0
  13. data/lib/infobright-loader/loader.rb +175 -0
  14. data/lib/infobright-loader/version.rb +4 -0
  15. data/tests/manual/bad-files/.gitignore +1 -0
  16. data/tests/manual/bad-files/control-file-raw.yml +50 -0
  17. data/tests/manual/bad-files/data/b/b_1.txt +6 -0
  18. data/tests/manual/bad-files/data/b/b_2.txt +6 -0
  19. data/tests/manual/bad-files/data/b/b_3.txt +6 -0
  20. data/tests/manual/bad-files/data/b/b_4.txt +6 -0
  21. data/tests/manual/bad-files/data/b/b_5.txt +6 -0
  22. data/tests/manual/bad-files/data/b/b_6.txt +6 -0
  23. data/tests/manual/bad-files/data/c/c_1.txt +6 -0
  24. data/tests/manual/bad-files/data/c/c_2.txt +6 -0
  25. data/tests/manual/bad-files/data/c/c_3.txt +6 -0
  26. data/tests/manual/bad-files/data/c/c_4.txt +6 -0
  27. data/tests/manual/bad-files/data/c/c_5.txt +6 -0
  28. data/tests/manual/bad-files/data/c/c_6.txt +6 -0
  29. data/tests/manual/bad-files/data/d/d_1.txt +6 -0
  30. data/tests/manual/bad-files/data/d/d_2.txt +6 -0
  31. data/tests/manual/bad-files/data/d/d_3.txt +6 -0
  32. data/tests/manual/bad-files/data/d/d_4.txt +6 -0
  33. data/tests/manual/bad-files/data/d/d_5.txt +6 -0
  34. data/tests/manual/bad-files/data/d/d_6.txt +6 -0
  35. data/tests/manual/bad-files/data/e/e_1.txt +6 -0
  36. data/tests/manual/bad-files/data/e/e_2.txt +6 -0
  37. data/tests/manual/bad-files/data/e/e_3.txt +6 -0
  38. data/tests/manual/bad-files/data/e/e_4.txt +6 -0
  39. data/tests/manual/bad-files/data/e/e_5.txt +6 -0
  40. data/tests/manual/bad-files/data/e/e_6.txt +6 -0
  41. data/tests/manual/bad-files/data/f/f_1.txt +6 -0
  42. data/tests/manual/bad-files/data/f/f_2.txt +6 -0
  43. data/tests/manual/bad-files/data/f/f_3.txt +6 -0
  44. data/tests/manual/bad-files/data/f/f_4.txt +6 -0
  45. data/tests/manual/bad-files/data/f/f_5.txt +6 -0
  46. data/tests/manual/bad-files/data/f/f_6.txt +6 -0
  47. data/tests/manual/bad-files/run_test.sh +40 -0
  48. data/tests/manual/bad-files/setup.sql +8 -0
  49. data/tests/manual/bad-files/verify.sql +10 -0
  50. data/tests/manual/control-file/.gitignore +1 -0
  51. data/tests/manual/control-file/control-file-raw.yml +50 -0
  52. data/tests/manual/control-file/data/b/b_1.txt +6 -0
  53. data/tests/manual/control-file/data/b/b_2.txt +6 -0
  54. data/tests/manual/control-file/data/b/b_3.txt +6 -0
  55. data/tests/manual/control-file/data/b/b_4.txt +6 -0
  56. data/tests/manual/control-file/data/b/b_5.txt +6 -0
  57. data/tests/manual/control-file/data/b/b_6.txt +6 -0
  58. data/tests/manual/control-file/data/c/c_1.txt +6 -0
  59. data/tests/manual/control-file/data/c/c_2.txt +6 -0
  60. data/tests/manual/control-file/data/c/c_3.txt +6 -0
  61. data/tests/manual/control-file/data/c/c_4.txt +6 -0
  62. data/tests/manual/control-file/data/c/c_5.txt +6 -0
  63. data/tests/manual/control-file/data/c/c_6.txt +6 -0
  64. data/tests/manual/control-file/data/d/d_1.txt +6 -0
  65. data/tests/manual/control-file/data/d/d_2.txt +6 -0
  66. data/tests/manual/control-file/data/d/d_3.txt +6 -0
  67. data/tests/manual/control-file/data/d/d_4.txt +6 -0
  68. data/tests/manual/control-file/data/d/d_5.txt +6 -0
  69. data/tests/manual/control-file/data/d/d_6.txt +6 -0
  70. data/tests/manual/control-file/data/e/e_1.txt +6 -0
  71. data/tests/manual/control-file/data/e/e_2.txt +6 -0
  72. data/tests/manual/control-file/data/e/e_3.txt +6 -0
  73. data/tests/manual/control-file/data/e/e_4.txt +6 -0
  74. data/tests/manual/control-file/data/e/e_5.txt +6 -0
  75. data/tests/manual/control-file/data/e/e_6.txt +6 -0
  76. data/tests/manual/control-file/data/f/f_1.txt +6 -0
  77. data/tests/manual/control-file/data/f/f_2.txt +6 -0
  78. data/tests/manual/control-file/data/f/f_3.txt +6 -0
  79. data/tests/manual/control-file/data/f/f_4.txt +6 -0
  80. data/tests/manual/control-file/data/f/f_5.txt +6 -0
  81. data/tests/manual/control-file/data/f/f_6.txt +6 -0
  82. data/tests/manual/control-file/run_test.sh +40 -0
  83. data/tests/manual/control-file/setup.sql +8 -0
  84. data/tests/manual/control-file/verify.sql +10 -0
  85. data/tests/manual/folder/data/a/a_1.txt +6 -0
  86. data/tests/manual/folder/data/a/a_2.txt +6 -0
  87. data/tests/manual/folder/data/a/a_3.txt +6 -0
  88. data/tests/manual/folder/data/a/a_4.txt +6 -0
  89. data/tests/manual/folder/data/a/a_5.txt +6 -0
  90. data/tests/manual/folder/data/a/a_6.txt +6 -0
  91. data/tests/manual/folder/run_test.sh +37 -0
  92. data/tests/manual/folder/setup.sql +4 -0
  93. data/tests/manual/folder/verify.sql +2 -0
  94. metadata +158 -0
@@ -0,0 +1,175 @@
1
+ # Copyright (c) 2012 SnowPlow Analytics Ltd. All rights reserved.
2
+ #
3
+ # This program is licensed to you under the Apache License Version 2.0,
4
+ # and you may not use this file except in compliance with the Apache License Version 2.0.
5
+ # You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0.
6
+ #
7
+ # Unless required by applicable law or agreed to in writing,
8
+ # software distributed under the Apache License Version 2.0 is distributed on an
9
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.
11
+
12
+ # Author:: Alex Dean (mailto:support@snowplowanalytics.com)
13
+ # Copyright:: Copyright (c) 2012 SnowPlow Analytics Ltd
14
+ # License:: Apache License Version 2.0
15
+
16
+ require 'thread'
17
+
18
+ require 'infobright-loader/db'
19
+
20
module InfobrightLoader
  # Routines for loading data files into Infobright tables, either one
  # table at a time or several tables concurrently (one thread per table).
  #
  # All methods are exposed as module functions, e.g.
  # InfobrightLoader::Loader.load_from_hash(...).
  module Loader

    # Raised when a load cannot be started (nothing to load, server or
    # database not reachable). Also the error type rescued per-file in
    # load_table so that one bad file does not abort the whole load.
    class LoadError < ArgumentError; end

    # Load a single table in Infobright with the contents of a single
    # folder. Files are collected recursively.
    #
    # folder    - path of the folder to scan; a trailing slash is optional
    # table     - name of the table to populate
    # db        - database configuration object (passed through to
    #             InfobrightLoader::Db)
    # separator - field separator used in the data files
    # encloser  - field encloser used in the data files
    #
    # Returns the list of failure descriptions produced by load_from_hash.
    # Raises LoadError if the folder contains no files.
    def load_from_folder(folder, table, db, separator='|', encloser='')

      # Build the glob with File.join so the search is recursive whether or
      # not the caller supplied a trailing slash on the folder name.
      pattern = File.join(folder.to_s, '**', '*')
      files = Dir[pattern].select { |f| File.file?(f) }.map { |f| File.expand_path(f) }

      # Check we have some files to load
      if files.empty?
        raise LoadError, "No files to load in folder #{folder}"
      end

      # Now we have converted the folder and table into a
      # one-entry hash, we can use load_from_hash()
      load_from_hash({ table => files }, db, 1, separator, encloser)
    end
    module_function :load_from_folder

    # Load Infobright using a hash of tables to filenames.
    #
    # load_hash - Hash mapping each table name to an Array of file paths
    # db        - database configuration object (must respond to #name)
    # processes - maximum number of concurrent loads; reduced to the number
    #             of tables when there are fewer tables than processes
    # separator - field separator used in the data files
    # encloser  - field encloser used in the data files
    #
    # Returns an Array of "file (error)" strings, one per file that failed.
    # Raises LoadError if there are no tables, or if the server or database
    # checks in InfobrightLoader::Db fail.
    def load_from_hash(load_hash, db, processes=10, separator='|', encloser='')

      t_count = load_hash.length

      # Some validation about the load we're going to do
      if t_count == 0
        raise LoadError, "We have no tables to populate"
      elsif t_count < processes
        puts "We have only #{t_count} table(s) to populate, reducing processes from #{processes} to #{t_count}" # TODO: move to Ruby logger?
        processes = t_count
      end

      # Now let's check MySQL server is accessible
      unless InfobrightLoader::Db.running?(db)
        raise LoadError, "Default MySQL server cannot be found or is not running"
      end

      # Now let's check that we can access the database
      unless InfobrightLoader::Db.db_exists?(db)
        raise LoadError, "Database #{db.name} cannot be found or user lacks sufficient privileges"
      end

      # Now we're ready to start with the load - a single table is loaded
      # directly, several tables are loaded in parallel
      if t_count == 1
        table, files = load_hash.first
        load_table(files, table, db, separator, encloser)
      else
        load_parallel(load_hash, db, processes, separator, encloser)
        # load_serial(load_hash, db, separator, encloser) # For debugging without worrying about threads.
      end
    end
    module_function :load_from_hash

    # NOTE: the helpers below are internal. `private` only affects the
    # instance-method copies; the module_function copies remain callable
    # on the module, which load_from_hash relies on.
    private

    # Load a single table, one file after another.
    #
    # Returns an Array of "file (error)" strings for every file whose load
    # raised LoadError; other files continue to be loaded.
    def load_table(files, table, db, separator, encloser)

      failures = [] # Files we didn't manage to load

      files.each do |f|
        puts "Loading file #{f} into table #{db.name}.#{table}" # TODO: move to Ruby logger?
        begin
          InfobrightLoader::Db.load_file(f, table, db, separator, encloser)
        rescue LoadError => le
          puts "LOAD ERROR: %s" % le # TODO: move to Ruby logger?
          failures << "%s (%s)" % [f, le]
        end
      end
      failures
    end
    module_function :load_table

    # Perform a serial load of every table in load_hash.
    # Only used for debugging.
    #
    # Returns the combined failure list of all tables.
    def load_serial(load_hash, db, separator, encloser)

      files_not_loaded = []

      load_hash.each do |table, files|
        files_not_loaded.concat(load_table(files, table, db, separator, encloser))
      end
      files_not_loaded
    end
    module_function :load_serial

    # Perform a parallel load: `processes` threads each repeatedly take a
    # table off a shared work list and load all of its files.
    #
    # Returns the combined failure list of all tables.
    def load_parallel(load_hash, db, processes, separator, encloser)

      tables_to_load = load_hash.keys # A fresh array; popping does not alter load_hash
      files_not_loaded = []
      mutex = Mutex.new

      # If an exception is thrown in a thread that isn't handled, die quickly.
      # This is a process-wide Thread setting.
      Thread.abort_on_exception = true

      # Create Ruby threads to concurrently execute Infobright loads
      threads = Array.new(processes) do
        Thread.new do
          loop do

            # `table` is assigned here, inside the thread's own block, so
            # every worker has its own copy. Sharing it between threads
            # would let one worker overwrite the table another worker has
            # just claimed.
            table = nil

            # Critical section: only one thread may inspect and modify
            # the work list at any time
            finished = mutex.synchronize do
              if tables_to_load.empty?
                true
              else
                table = tables_to_load.pop
                false
              end
            end

            # Let's quit if there was no table left for this thread
            break if finished

            # Otherwise let's run through and do all the loads for this table
            failures = load_table(load_hash[table], table, db, separator, encloser)

            # Also critical: only one thread should update the failures
            # list at a time
            unless failures.empty?
              mutex.synchronize { files_not_loaded.concat(failures) }
            end
          end
        end
      end

      # Wait for threads to finish
      threads.each { |worker| worker.join }
      files_not_loaded
    end
    module_function :load_parallel

  end
end
@@ -0,0 +1,4 @@
1
# Library-wide name and version constants.
module InfobrightLoader
  NAME    = 'infobright-loader'
  VERSION = '0.0.1'
end
@@ -0,0 +1 @@
1
+ control-file-out.yml
@@ -0,0 +1,50 @@
1
+ # Control file for the Infobright Ruby Loader bad-files test
2
+ #
3
+ # Is processed with sed to finalise <<x>> variables before
4
+ # the test is run
5
+
6
+ :load:
7
+ :processes: 3 # Test with 3 processes populating 5 tables
8
+ :database:
9
+ :name: wrong # Will override at the command line to irl_tests
10
+ :username: <<USERNAME>>
11
+ :password: <<PASSWORD>>
12
+ :data_format:
13
+ :separator: \|
14
+ :encloser: \' # Will override at the command line to "
15
+ :data_loads:
16
+ b:
17
+ - <<PATH>>/data/b/b_1.txt # Bad data
18
+ - <<PATH>>/data/b/b_2.txt
19
+ - <<PATH>>/data/b/b_3.txt
20
+ - <<PATH>>/data/b/b_4.txt
21
+ - <<PATH>>/data/b/b_5.txt
22
+ - <<PATH>>/data/b/b_6.txt
23
+ c:
24
+ - <<PATH>>/data/c/c_1.txt
25
+ - <<PATH>>/data/c/c_2.txt
26
+ - <<PATH>>/data/c/BAD_3.txt
27
+ - <<PATH>>/data/c/c_4.txt
28
+ - <<PATH>>/data/c/c_5.txt
29
+ - <<PATH>>/data/c/c_6.txt
30
+ d:
31
+ - <<PATH>>/data/d/BAD_1.txt
32
+ - <<PATH>>/data/d/d_2.txt
33
+ - <<PATH>>/data/d/d_3.txt
34
+ - <<PATH>>/data/d/d_4.txt
35
+ - <<PATH>>/data/d/d_5.txt # Bad data
36
+ - <<PATH>>/data/d/d_6.txt
37
+ e:
38
+ - <<PATH>>/data/e/e_1.txt
39
+ - <<PATH>>/data/e/e_2.txt
40
+ - <<PATH>>/data/e/e_3.txt
41
+ - <<PATH>>/data/e/BAD_4.txt
42
+ - <<PATH>>/data/e/e_5.txt
43
+ - <<PATH>>/data/e/BAD_6.txt
44
+ f:
45
+ - <<PATH>>/data/f/f_1.txt
46
+ - <<PATH>>/data/f/f_2.txt
47
+ - <<PATH>>/data/f/f_3.txt
48
+ - <<PATH>>/data/f/f_4.txt
49
+ - <<PATH>>/data/f/f_5.txt
50
+ - <<PATH>>/data/f/f_6.txt
@@ -0,0 +1,6 @@
1
+ "1"|"B.1"
2
+ "2"|"B.2"
3
+ "3"|"B.3"
4
+ "4"|"B.4"
5
+ bad
6
+ "6"|"B.6"
@@ -0,0 +1,6 @@
1
+ "7"|"B.7"
2
+ "8"|"B.8"
3
+ "9"|"B.9"
4
+ "10"|"B.10"
5
+ "11"|"B.11"
6
+ "12"|"B.12"
@@ -0,0 +1,6 @@
1
+ "13"|"B.13"
2
+ "14"|"B.14"
3
+ "15"|"B.15"
4
+ "16"|"B.16"
5
+ "17"|"B.17"
6
+ "18"|"B.18"
@@ -0,0 +1,6 @@
1
+ "19"|"B.19"
2
+ "20"|"B.20"
3
+ "21"|"B.21"
4
+ "22"|"B.22"
5
+ "23"|"B.23"
6
+ "24"|"B.24"
@@ -0,0 +1,6 @@
1
+ "25"|"B.25"
2
+ "26"|"B.26"
3
+ "27"|"B.27"
4
+ "28"|"B.28"
5
+ "29"|"B.29"
6
+ "30"|"B.30"
@@ -0,0 +1,6 @@
1
+ "31"|"B.31"
2
+ "32"|"B.32"
3
+ "33"|"B.33"
4
+ "34"|"B.34"
5
+ "35"|"B.35"
6
+ "36"|"B.36"
@@ -0,0 +1,6 @@
1
+ "1"|"C.1"
2
+ "2"|"C.2"
3
+ "3"|"C.3"
4
+ "4"|"C.4"
5
+ "5"|"C.5"
6
+ "6"|"C.6"
@@ -0,0 +1,6 @@
1
+ "7"|"C.7"
2
+ "8"|"C.8"
3
+ "9"|"C.9"
4
+ "10"|"C.10"
5
+ "11"|"C.11"
6
+ "12"|"C.12"
@@ -0,0 +1,6 @@
1
+ "13"|"C.13"
2
+ "14"|"C.14"
3
+ "15"|"C.15"
4
+ "16"|"C.16"
5
+ "17"|"C.17"
6
+ "18"|"C.18"
@@ -0,0 +1,6 @@
1
+ "19"|"C.19"
2
+ "20"|"C.20"
3
+ "21"|"C.21"
4
+ "22"|"C.22"
5
+ "23"|"C.23"
6
+ "24"|"C.24"
@@ -0,0 +1,6 @@
1
+ "25"|"C.25"
2
+ "26"|"C.26"
3
+ "27"|"C.27"
4
+ "28"|"C.28"
5
+ "29"|"C.29"
6
+ "30"|"C.30"
@@ -0,0 +1,6 @@
1
+ "31"|"C.31"
2
+ "32"|"C.32"
3
+ "33"|"C.33"
4
+ "34"|"C.34"
5
+ "35"|"C.35"
6
+ "36"|"C.36"
@@ -0,0 +1,6 @@
1
+ "1"|"D.1"
2
+ "2"|"D.2"
3
+ "3"|"D.3"
4
+ "4"|"D.4"
5
+ "5"|"D.5"
6
+ "6"|"D.6"
@@ -0,0 +1,6 @@
1
+ "7"|"D.7"
2
+ "8"|"D.8"
3
+ "9"|"D.9"
4
+ "10"|"D.10"
5
+ "11"|"D.11"
6
+ "12"|"D.12"
@@ -0,0 +1,6 @@
1
+ "13"|"D.13"
2
+ "14"|"D.14"
3
+ "15"|"D.15"
4
+ "16"|"D.16"
5
+ "17"|"D.17"
6
+ "18"|"D.18"
@@ -0,0 +1,6 @@
1
+ "19"|"D.19"
2
+ "20"|"D.20"
3
+ "21"|"D.21"
4
+ "22"|"D.22"
5
+ "23"|"D.23"
6
+ "24"|"D.24"
@@ -0,0 +1,6 @@
1
+ "25"|"D.25"
2
+ "26"|"D.26"
3
+ bad
4
+ "28"|"D.28"
5
+ "29"|"D.29"
6
+ "30"|"D.30"
@@ -0,0 +1,6 @@
1
+ "31"|"D.31"
2
+ "32"|"D.32"
3
+ "33"|"D.33"
4
+ "34"|"D.34"
5
+ "35"|"D.35"
6
+ "36"|"D.36"
@@ -0,0 +1,6 @@
1
+ "1"|"E.1"
2
+ "2"|"E.2"
3
+ "3"|"E.3"
4
+ "4"|"E.4"
5
+ "5"|"E.5"
6
+ "6"|"E.6"
@@ -0,0 +1,6 @@
1
+ "7"|"E.7"
2
+ "8"|"E.8"
3
+ "9"|"E.9"
4
+ "10"|"E.10"
5
+ "11"|"E.11"
6
+ "12"|"E.12"
@@ -0,0 +1,6 @@
1
+ "13"|"E.13"
2
+ "14"|"E.14"
3
+ "15"|"E.15"
4
+ "16"|"E.16"
5
+ "17"|"E.17"
6
+ "18"|"E.18"
@@ -0,0 +1,6 @@
1
+ "19"|"E.19"
2
+ "20"|"E.20"
3
+ "21"|"E.21"
4
+ "22"|"E.22"
5
+ "23"|"E.23"
6
+ "24"|"E.24"
@@ -0,0 +1,6 @@
1
+ "25"|"E.25"
2
+ "26"|"E.26"
3
+ "27"|"E.27"
4
+ "28"|"E.28"
5
+ "29"|"E.29"
6
+ "30"|"E.30"
@@ -0,0 +1,6 @@
1
+ "31"|"E.31"
2
+ "32"|"E.32"
3
+ "33"|"E.33"
4
+ "34"|"E.34"
5
+ "35"|"E.35"
6
+ "36"|"E.36"
@@ -0,0 +1,6 @@
1
+ "1"|"F.1"
2
+ "2"|"F.2"
3
+ "3"|"F.3"
4
+ "4"|"F.4"
5
+ "5"|"F.5"
6
+ "6"|"F.6"
@@ -0,0 +1,6 @@
1
+ "7"|"F.7"
2
+ "8"|"F.8"
3
+ "9"|"F.9"
4
+ "10"|"F.10"
5
+ "11"|"F.11"
6
+ "12"|"F.12"
@@ -0,0 +1,6 @@
1
+ "13"|"F.13"
2
+ "14"|"F.14"
3
+ "15"|"F.15"
4
+ "16"|"F.16"
5
+ "17"|"F.17"
6
+ "18"|"F.18"