infobright-loader 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. data/.gitignore +17 -0
  2. data/Gemfile +19 -0
  3. data/LICENSE-2.0.txt +202 -0
  4. data/README.md +232 -0
  5. data/Rakefile +2 -0
  6. data/bin/infobright-loader +45 -0
  7. data/control-file/template.yml +23 -0
  8. data/infobright-loader.gemspec +35 -0
  9. data/lib/infobright-loader.rb +5 -0
  10. data/lib/infobright-loader/cli/config.rb +204 -0
  11. data/lib/infobright-loader/cli/loader.rb +66 -0
  12. data/lib/infobright-loader/db.rb +90 -0
  13. data/lib/infobright-loader/loader.rb +175 -0
  14. data/lib/infobright-loader/version.rb +4 -0
  15. data/tests/manual/bad-files/.gitignore +1 -0
  16. data/tests/manual/bad-files/control-file-raw.yml +50 -0
  17. data/tests/manual/bad-files/data/b/b_1.txt +6 -0
  18. data/tests/manual/bad-files/data/b/b_2.txt +6 -0
  19. data/tests/manual/bad-files/data/b/b_3.txt +6 -0
  20. data/tests/manual/bad-files/data/b/b_4.txt +6 -0
  21. data/tests/manual/bad-files/data/b/b_5.txt +6 -0
  22. data/tests/manual/bad-files/data/b/b_6.txt +6 -0
  23. data/tests/manual/bad-files/data/c/c_1.txt +6 -0
  24. data/tests/manual/bad-files/data/c/c_2.txt +6 -0
  25. data/tests/manual/bad-files/data/c/c_3.txt +6 -0
  26. data/tests/manual/bad-files/data/c/c_4.txt +6 -0
  27. data/tests/manual/bad-files/data/c/c_5.txt +6 -0
  28. data/tests/manual/bad-files/data/c/c_6.txt +6 -0
  29. data/tests/manual/bad-files/data/d/d_1.txt +6 -0
  30. data/tests/manual/bad-files/data/d/d_2.txt +6 -0
  31. data/tests/manual/bad-files/data/d/d_3.txt +6 -0
  32. data/tests/manual/bad-files/data/d/d_4.txt +6 -0
  33. data/tests/manual/bad-files/data/d/d_5.txt +6 -0
  34. data/tests/manual/bad-files/data/d/d_6.txt +6 -0
  35. data/tests/manual/bad-files/data/e/e_1.txt +6 -0
  36. data/tests/manual/bad-files/data/e/e_2.txt +6 -0
  37. data/tests/manual/bad-files/data/e/e_3.txt +6 -0
  38. data/tests/manual/bad-files/data/e/e_4.txt +6 -0
  39. data/tests/manual/bad-files/data/e/e_5.txt +6 -0
  40. data/tests/manual/bad-files/data/e/e_6.txt +6 -0
  41. data/tests/manual/bad-files/data/f/f_1.txt +6 -0
  42. data/tests/manual/bad-files/data/f/f_2.txt +6 -0
  43. data/tests/manual/bad-files/data/f/f_3.txt +6 -0
  44. data/tests/manual/bad-files/data/f/f_4.txt +6 -0
  45. data/tests/manual/bad-files/data/f/f_5.txt +6 -0
  46. data/tests/manual/bad-files/data/f/f_6.txt +6 -0
  47. data/tests/manual/bad-files/run_test.sh +40 -0
  48. data/tests/manual/bad-files/setup.sql +8 -0
  49. data/tests/manual/bad-files/verify.sql +10 -0
  50. data/tests/manual/control-file/.gitignore +1 -0
  51. data/tests/manual/control-file/control-file-raw.yml +50 -0
  52. data/tests/manual/control-file/data/b/b_1.txt +6 -0
  53. data/tests/manual/control-file/data/b/b_2.txt +6 -0
  54. data/tests/manual/control-file/data/b/b_3.txt +6 -0
  55. data/tests/manual/control-file/data/b/b_4.txt +6 -0
  56. data/tests/manual/control-file/data/b/b_5.txt +6 -0
  57. data/tests/manual/control-file/data/b/b_6.txt +6 -0
  58. data/tests/manual/control-file/data/c/c_1.txt +6 -0
  59. data/tests/manual/control-file/data/c/c_2.txt +6 -0
  60. data/tests/manual/control-file/data/c/c_3.txt +6 -0
  61. data/tests/manual/control-file/data/c/c_4.txt +6 -0
  62. data/tests/manual/control-file/data/c/c_5.txt +6 -0
  63. data/tests/manual/control-file/data/c/c_6.txt +6 -0
  64. data/tests/manual/control-file/data/d/d_1.txt +6 -0
  65. data/tests/manual/control-file/data/d/d_2.txt +6 -0
  66. data/tests/manual/control-file/data/d/d_3.txt +6 -0
  67. data/tests/manual/control-file/data/d/d_4.txt +6 -0
  68. data/tests/manual/control-file/data/d/d_5.txt +6 -0
  69. data/tests/manual/control-file/data/d/d_6.txt +6 -0
  70. data/tests/manual/control-file/data/e/e_1.txt +6 -0
  71. data/tests/manual/control-file/data/e/e_2.txt +6 -0
  72. data/tests/manual/control-file/data/e/e_3.txt +6 -0
  73. data/tests/manual/control-file/data/e/e_4.txt +6 -0
  74. data/tests/manual/control-file/data/e/e_5.txt +6 -0
  75. data/tests/manual/control-file/data/e/e_6.txt +6 -0
  76. data/tests/manual/control-file/data/f/f_1.txt +6 -0
  77. data/tests/manual/control-file/data/f/f_2.txt +6 -0
  78. data/tests/manual/control-file/data/f/f_3.txt +6 -0
  79. data/tests/manual/control-file/data/f/f_4.txt +6 -0
  80. data/tests/manual/control-file/data/f/f_5.txt +6 -0
  81. data/tests/manual/control-file/data/f/f_6.txt +6 -0
  82. data/tests/manual/control-file/run_test.sh +40 -0
  83. data/tests/manual/control-file/setup.sql +8 -0
  84. data/tests/manual/control-file/verify.sql +10 -0
  85. data/tests/manual/folder/data/a/a_1.txt +6 -0
  86. data/tests/manual/folder/data/a/a_2.txt +6 -0
  87. data/tests/manual/folder/data/a/a_3.txt +6 -0
  88. data/tests/manual/folder/data/a/a_4.txt +6 -0
  89. data/tests/manual/folder/data/a/a_5.txt +6 -0
  90. data/tests/manual/folder/data/a/a_6.txt +6 -0
  91. data/tests/manual/folder/run_test.sh +37 -0
  92. data/tests/manual/folder/setup.sql +4 -0
  93. data/tests/manual/folder/verify.sql +2 -0
  94. metadata +158 -0
@@ -0,0 +1,175 @@
1
+ # Copyright (c) 2012 SnowPlow Analytics Ltd. All rights reserved.
2
+ #
3
+ # This program is licensed to you under the Apache License Version 2.0,
4
+ # and you may not use this file except in compliance with the Apache License Version 2.0.
5
+ # You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0.
6
+ #
7
+ # Unless required by applicable law or agreed to in writing,
8
+ # software distributed under the Apache License Version 2.0 is distributed on an
9
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.
11
+
12
+ # Author:: Alex Dean (mailto:support@snowplowanalytics.com)
13
+ # Copyright:: Copyright (c) 2012 SnowPlow Analytics Ltd
14
+ # License:: Apache License Version 2.0
15
+
16
+ require 'thread'
17
+
18
+ require 'infobright-loader/db'
19
+
20
module InfobrightLoader
  # Functions for loading data files into Infobright tables, either one
  # table at a time or several tables concurrently (one thread per table).
  #
  # All functions are exposed as module functions, e.g.
  # InfobrightLoader::Loader.load_from_hash(...).
  module Loader

    # Raised when a load cannot be started (nothing to load, server or
    # database not reachable, invalid process count). load_table also
    # rescues this error type on a per-file basis.
    class LoadError < ArgumentError; end

    # Load a single table in Infobright with the contents of a single
    # folder. The folder is searched recursively for regular files.
    #
    # folder    - path of the folder to scan (with or without a trailing
    #             path separator)
    # table     - name of the table to populate
    # db        - database descriptor handed through to InfobrightLoader::Db
    #             (must respond to #name)
    # separator - field separator passed to the loader (default '|')
    # encloser  - field encloser passed to the loader (default '')
    #
    # Returns the Array of failure descriptions from load_from_hash.
    # Raises LoadError if the folder contains no files.
    def load_from_folder(folder, table, db, separator='|', encloser='')

      # File.join guarantees a separator between the folder and the '**'
      # wildcard; plain string interpolation only produced a recursive
      # pattern when the folder already ended in '/'.
      pattern = File.join(folder, '**', '*')
      files = Dir[pattern].select { |f| File.file?(f) }.map { |f| File.expand_path(f) }

      # Check we have some files to load
      if files.empty?
        raise LoadError, "No files to load in folder #{folder}"
      end

      # Now we have converted the folder and table into a one-entry hash,
      # we can use load_from_hash()
      load_from_hash({ table => files }, db, 1, separator, encloser)
    end
    module_function :load_from_folder

    # Load Infobright using a hash of tables to filenames.
    #
    # load_hash - Hash mapping each table name to an Array of file paths
    # db        - database descriptor handed through to InfobrightLoader::Db
    #             (must respond to #name)
    # processes - maximum number of concurrent loads (default 10); reduced
    #             to the number of tables when there are fewer tables
    # separator - field separator passed to the loader (default '|')
    # encloser  - field encloser passed to the loader (default '')
    #
    # Returns an Array of "file (error)" Strings, one per file that failed
    # to load; empty when every file loaded.
    # Raises LoadError if there are no tables, or if the checks made through
    # InfobrightLoader::Db.running? / db_exists? fail.
    def load_from_hash(load_hash, db, processes=10, separator='|', encloser='')

      t_count = load_hash.length

      # Some validation about the load we're going to do
      if t_count == 0
        raise LoadError, "We have no tables to populate"
      elsif t_count < processes
        puts "We have only #{t_count} table(s) to populate, reducing processes from #{processes} to #{t_count}" # TODO: move to Ruby logger?
        processes = t_count
      end

      # Now let's check MySQL server is accessible
      unless InfobrightLoader::Db.running?(db)
        raise LoadError, "Default MySQL server cannot be found or is not running"
      end

      # Now let's check that we can access the database
      unless InfobrightLoader::Db.db_exists?(db)
        raise LoadError, "Database #{db.name} cannot be found or user lacks sufficient privileges"
      end

      # A single table never needs threads; anything else goes parallel.
      if t_count == 1
        table, files = load_hash.first
        load_table(files, table, db, separator, encloser)
      else
        load_parallel(load_hash, db, processes, separator, encloser)
        # load_serial(load_hash, db, separator, encloser) # For debugging without worrying about threads.
      end
    end
    module_function :load_from_hash

    # -- Internal helpers ---------------------------------------------------
    # (module_function already makes the instance-method copies private, so
    # no separate `private` keyword is needed here.)

    # Load every file in +files+ into a single table, one after another.
    #
    # A LoadError raised for a file is reported and recorded, and the
    # remaining files are still attempted. Any other exception propagates.
    #
    # Returns an Array of "file (error)" Strings for the files that failed.
    def load_table(files, table, db, separator, encloser)

      failures = [] # Files we didn't manage to load

      files.each do |f|
        puts "Loading file #{f} into table #{db.name}.#{table}" # TODO: move to Ruby logger?
        begin
          InfobrightLoader::Db.load_file(f, table, db, separator, encloser)
        rescue LoadError => le
          puts "LOAD ERROR: %s" % le # TODO: move to Ruby logger?
          failures << "%s (%s)" % [f, le]
        end
      end

      failures
    end
    module_function :load_table

    # Perform a serial load of every table in +load_hash+.
    # Only used for debugging.
    #
    # Returns the combined Array of failure Strings from all tables.
    def load_serial(load_hash, db, separator, encloser)

      files_not_loaded = []

      load_hash.each do |table, files|
        files_not_loaded.concat(load_table(files, table, db, separator, encloser))
      end

      files_not_loaded
    end
    module_function :load_serial

    # Perform a parallel load: +processes+ worker threads each repeatedly
    # take one table off a shared list and load all of that table's files.
    #
    # Returns the combined Array of failure Strings from all tables.
    # Raises LoadError if +processes+ is less than 1 (no worker would run
    # and the load would silently do nothing).
    def load_parallel(load_hash, db, processes, separator, encloser)

      if processes < 1
        raise LoadError, "Number of processes must be at least 1 (got #{processes})"
      end

      tables_to_load = load_hash.keys
      files_not_loaded = []
      mutex = Mutex.new

      # Create Ruby threads to concurrently execute Infobright loads
      threads = (0...processes).map do
        Thread.new do

          # If an exception is thrown in this worker that isn't handled,
          # die quickly. Set per thread so the process-wide default is
          # left untouched.
          Thread.current.abort_on_exception = true

          loop do
            # Critical section: only one thread may modify the array at a
            # time. The popped table is local to this thread, so another
            # worker can never overwrite it before we use it.
            table = mutex.synchronize { tables_to_load.pop }

            # pop returns nil once the list is exhausted: exit the thread
            break if table.nil?

            # Otherwise run through and do all the loads for this table
            failures = load_table(load_hash[table], table, db, separator, encloser)

            # Also critical: only one thread should update the failures
            # list at a time
            mutex.synchronize { files_not_loaded.concat(failures) }
          end
        end
      end

      # Wait for threads to finish
      threads.each { |worker| worker.join }
      files_not_loaded
    end
    module_function :load_parallel

  end
end
@@ -0,0 +1,4 @@
1
# Identity constants for the infobright-loader package.
module InfobrightLoader
  # Program / package name string.
  NAME = "infobright-loader"

  # Release version string.
  VERSION = "0.0.1"
end
@@ -0,0 +1 @@
1
+ control-file-out.yml
@@ -0,0 +1,50 @@
1
+ # Control file for Infobright Ruby Loader bad-files test
2
+ #
3
+ # Is processed with sed to finalise <<x>> variables before
4
+ # the test is run
5
+
6
+ :load:
7
+ :processes: 3 # Test with 3 processes populating 5 tables
8
+ :database:
9
+ :name: wrong # Will override at the command line to irl_tests
10
+ :username: <<USERNAME>>
11
+ :password: <<PASSWORD>>
12
+ :data_format:
13
+ :separator: \|
14
+ :encloser: \' # Will override at the command line to "
15
+ :data_loads:
16
+ b:
17
+ - <<PATH>>/data/b/b_1.txt # Bad data
18
+ - <<PATH>>/data/b/b_2.txt
19
+ - <<PATH>>/data/b/b_3.txt
20
+ - <<PATH>>/data/b/b_4.txt
21
+ - <<PATH>>/data/b/b_5.txt
22
+ - <<PATH>>/data/b/b_6.txt
23
+ c:
24
+ - <<PATH>>/data/c/c_1.txt
25
+ - <<PATH>>/data/c/c_2.txt
26
+ - <<PATH>>/data/c/BAD_3.txt
27
+ - <<PATH>>/data/c/c_4.txt
28
+ - <<PATH>>/data/c/c_5.txt
29
+ - <<PATH>>/data/c/c_6.txt
30
+ d:
31
+ - <<PATH>>/data/d/BAD_1.txt
32
+ - <<PATH>>/data/d/d_2.txt
33
+ - <<PATH>>/data/d/d_3.txt
34
+ - <<PATH>>/data/d/d_4.txt
35
+ - <<PATH>>/data/d/d_5.txt # Bad data
36
+ - <<PATH>>/data/d/d_6.txt
37
+ e:
38
+ - <<PATH>>/data/e/e_1.txt
39
+ - <<PATH>>/data/e/e_2.txt
40
+ - <<PATH>>/data/e/e_3.txt
41
+ - <<PATH>>/data/e/BAD_4.txt
42
+ - <<PATH>>/data/e/e_5.txt
43
+ - <<PATH>>/data/e/BAD_6.txt
44
+ f:
45
+ - <<PATH>>/data/f/f_1.txt
46
+ - <<PATH>>/data/f/f_2.txt
47
+ - <<PATH>>/data/f/f_3.txt
48
+ - <<PATH>>/data/f/f_4.txt
49
+ - <<PATH>>/data/f/f_5.txt
50
+ - <<PATH>>/data/f/f_6.txt
@@ -0,0 +1,6 @@
1
+ "1"|"B.1"
2
+ "2"|"B.2"
3
+ "3"|"B.3"
4
+ "4"|"B.4"
5
+ bad
6
+ "6"|"B.6"
@@ -0,0 +1,6 @@
1
+ "7"|"B.7"
2
+ "8"|"B.8"
3
+ "9"|"B.9"
4
+ "10"|"B.10"
5
+ "11"|"B.11"
6
+ "12"|"B.12"
@@ -0,0 +1,6 @@
1
+ "13"|"B.13"
2
+ "14"|"B.14"
3
+ "15"|"B.15"
4
+ "16"|"B.16"
5
+ "17"|"B.17"
6
+ "18"|"B.18"
@@ -0,0 +1,6 @@
1
+ "19"|"B.19"
2
+ "20"|"B.20"
3
+ "21"|"B.21"
4
+ "22"|"B.22"
5
+ "23"|"B.23"
6
+ "24"|"B.24"
@@ -0,0 +1,6 @@
1
+ "25"|"B.25"
2
+ "26"|"B.26"
3
+ "27"|"B.27"
4
+ "28"|"B.28"
5
+ "29"|"B.29"
6
+ "30"|"B.30"
@@ -0,0 +1,6 @@
1
+ "31"|"B.31"
2
+ "32"|"B.32"
3
+ "33"|"B.33"
4
+ "34"|"B.34"
5
+ "35"|"B.35"
6
+ "36"|"B.36"
@@ -0,0 +1,6 @@
1
+ "1"|"C.1"
2
+ "2"|"C.2"
3
+ "3"|"C.3"
4
+ "4"|"C.4"
5
+ "5"|"C.5"
6
+ "6"|"C.6"
@@ -0,0 +1,6 @@
1
+ "7"|"C.7"
2
+ "8"|"C.8"
3
+ "9"|"C.9"
4
+ "10"|"C.10"
5
+ "11"|"C.11"
6
+ "12"|"C.12"
@@ -0,0 +1,6 @@
1
+ "13"|"C.13"
2
+ "14"|"C.14"
3
+ "15"|"C.15"
4
+ "16"|"C.16"
5
+ "17"|"C.17"
6
+ "18"|"C.18"
@@ -0,0 +1,6 @@
1
+ "19"|"C.19"
2
+ "20"|"C.20"
3
+ "21"|"C.21"
4
+ "22"|"C.22"
5
+ "23"|"C.23"
6
+ "24"|"C.24"
@@ -0,0 +1,6 @@
1
+ "25"|"C.25"
2
+ "26"|"C.26"
3
+ "27"|"C.27"
4
+ "28"|"C.28"
5
+ "29"|"C.29"
6
+ "30"|"C.30"
@@ -0,0 +1,6 @@
1
+ "31"|"C.31"
2
+ "32"|"C.32"
3
+ "33"|"C.33"
4
+ "34"|"C.34"
5
+ "35"|"C.35"
6
+ "36"|"C.36"
@@ -0,0 +1,6 @@
1
+ "1"|"D.1"
2
+ "2"|"D.2"
3
+ "3"|"D.3"
4
+ "4"|"D.4"
5
+ "5"|"D.5"
6
+ "6"|"D.6"
@@ -0,0 +1,6 @@
1
+ "7"|"D.7"
2
+ "8"|"D.8"
3
+ "9"|"D.9"
4
+ "10"|"D.10"
5
+ "11"|"D.11"
6
+ "12"|"D.12"
@@ -0,0 +1,6 @@
1
+ "13"|"D.13"
2
+ "14"|"D.14"
3
+ "15"|"D.15"
4
+ "16"|"D.16"
5
+ "17"|"D.17"
6
+ "18"|"D.18"
@@ -0,0 +1,6 @@
1
+ "19"|"D.19"
2
+ "20"|"D.20"
3
+ "21"|"D.21"
4
+ "22"|"D.22"
5
+ "23"|"D.23"
6
+ "24"|"D.24"
@@ -0,0 +1,6 @@
1
+ "25"|"D.25"
2
+ "26"|"D.26"
3
+ bad
4
+ "28"|"D.28"
5
+ "29"|"D.29"
6
+ "30"|"D.30"
@@ -0,0 +1,6 @@
1
+ "31"|"D.31"
2
+ "32"|"D.32"
3
+ "33"|"D.33"
4
+ "34"|"D.34"
5
+ "35"|"D.35"
6
+ "36"|"D.36"
@@ -0,0 +1,6 @@
1
+ "1"|"E.1"
2
+ "2"|"E.2"
3
+ "3"|"E.3"
4
+ "4"|"E.4"
5
+ "5"|"E.5"
6
+ "6"|"E.6"
@@ -0,0 +1,6 @@
1
+ "7"|"E.7"
2
+ "8"|"E.8"
3
+ "9"|"E.9"
4
+ "10"|"E.10"
5
+ "11"|"E.11"
6
+ "12"|"E.12"
@@ -0,0 +1,6 @@
1
+ "13"|"E.13"
2
+ "14"|"E.14"
3
+ "15"|"E.15"
4
+ "16"|"E.16"
5
+ "17"|"E.17"
6
+ "18"|"E.18"
@@ -0,0 +1,6 @@
1
+ "19"|"E.19"
2
+ "20"|"E.20"
3
+ "21"|"E.21"
4
+ "22"|"E.22"
5
+ "23"|"E.23"
6
+ "24"|"E.24"
@@ -0,0 +1,6 @@
1
+ "25"|"E.25"
2
+ "26"|"E.26"
3
+ "27"|"E.27"
4
+ "28"|"E.28"
5
+ "29"|"E.29"
6
+ "30"|"E.30"
@@ -0,0 +1,6 @@
1
+ "31"|"E.31"
2
+ "32"|"E.32"
3
+ "33"|"E.33"
4
+ "34"|"E.34"
5
+ "35"|"E.35"
6
+ "36"|"E.36"
@@ -0,0 +1,6 @@
1
+ "1"|"F.1"
2
+ "2"|"F.2"
3
+ "3"|"F.3"
4
+ "4"|"F.4"
5
+ "5"|"F.5"
6
+ "6"|"F.6"
@@ -0,0 +1,6 @@
1
+ "7"|"F.7"
2
+ "8"|"F.8"
3
+ "9"|"F.9"
4
+ "10"|"F.10"
5
+ "11"|"F.11"
6
+ "12"|"F.12"
@@ -0,0 +1,6 @@
1
+ "13"|"F.13"
2
+ "14"|"F.14"
3
+ "15"|"F.15"
4
+ "16"|"F.16"
5
+ "17"|"F.17"
6
+ "18"|"F.18"