infobright-loader 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/Gemfile +19 -0
- data/LICENSE-2.0.txt +202 -0
- data/README.md +232 -0
- data/Rakefile +2 -0
- data/bin/infobright-loader +45 -0
- data/control-file/template.yml +23 -0
- data/infobright-loader.gemspec +35 -0
- data/lib/infobright-loader.rb +5 -0
- data/lib/infobright-loader/cli/config.rb +204 -0
- data/lib/infobright-loader/cli/loader.rb +66 -0
- data/lib/infobright-loader/db.rb +90 -0
- data/lib/infobright-loader/loader.rb +175 -0
- data/lib/infobright-loader/version.rb +4 -0
- data/tests/manual/bad-files/.gitignore +1 -0
- data/tests/manual/bad-files/control-file-raw.yml +50 -0
- data/tests/manual/bad-files/data/b/b_1.txt +6 -0
- data/tests/manual/bad-files/data/b/b_2.txt +6 -0
- data/tests/manual/bad-files/data/b/b_3.txt +6 -0
- data/tests/manual/bad-files/data/b/b_4.txt +6 -0
- data/tests/manual/bad-files/data/b/b_5.txt +6 -0
- data/tests/manual/bad-files/data/b/b_6.txt +6 -0
- data/tests/manual/bad-files/data/c/c_1.txt +6 -0
- data/tests/manual/bad-files/data/c/c_2.txt +6 -0
- data/tests/manual/bad-files/data/c/c_3.txt +6 -0
- data/tests/manual/bad-files/data/c/c_4.txt +6 -0
- data/tests/manual/bad-files/data/c/c_5.txt +6 -0
- data/tests/manual/bad-files/data/c/c_6.txt +6 -0
- data/tests/manual/bad-files/data/d/d_1.txt +6 -0
- data/tests/manual/bad-files/data/d/d_2.txt +6 -0
- data/tests/manual/bad-files/data/d/d_3.txt +6 -0
- data/tests/manual/bad-files/data/d/d_4.txt +6 -0
- data/tests/manual/bad-files/data/d/d_5.txt +6 -0
- data/tests/manual/bad-files/data/d/d_6.txt +6 -0
- data/tests/manual/bad-files/data/e/e_1.txt +6 -0
- data/tests/manual/bad-files/data/e/e_2.txt +6 -0
- data/tests/manual/bad-files/data/e/e_3.txt +6 -0
- data/tests/manual/bad-files/data/e/e_4.txt +6 -0
- data/tests/manual/bad-files/data/e/e_5.txt +6 -0
- data/tests/manual/bad-files/data/e/e_6.txt +6 -0
- data/tests/manual/bad-files/data/f/f_1.txt +6 -0
- data/tests/manual/bad-files/data/f/f_2.txt +6 -0
- data/tests/manual/bad-files/data/f/f_3.txt +6 -0
- data/tests/manual/bad-files/data/f/f_4.txt +6 -0
- data/tests/manual/bad-files/data/f/f_5.txt +6 -0
- data/tests/manual/bad-files/data/f/f_6.txt +6 -0
- data/tests/manual/bad-files/run_test.sh +40 -0
- data/tests/manual/bad-files/setup.sql +8 -0
- data/tests/manual/bad-files/verify.sql +10 -0
- data/tests/manual/control-file/.gitignore +1 -0
- data/tests/manual/control-file/control-file-raw.yml +50 -0
- data/tests/manual/control-file/data/b/b_1.txt +6 -0
- data/tests/manual/control-file/data/b/b_2.txt +6 -0
- data/tests/manual/control-file/data/b/b_3.txt +6 -0
- data/tests/manual/control-file/data/b/b_4.txt +6 -0
- data/tests/manual/control-file/data/b/b_5.txt +6 -0
- data/tests/manual/control-file/data/b/b_6.txt +6 -0
- data/tests/manual/control-file/data/c/c_1.txt +6 -0
- data/tests/manual/control-file/data/c/c_2.txt +6 -0
- data/tests/manual/control-file/data/c/c_3.txt +6 -0
- data/tests/manual/control-file/data/c/c_4.txt +6 -0
- data/tests/manual/control-file/data/c/c_5.txt +6 -0
- data/tests/manual/control-file/data/c/c_6.txt +6 -0
- data/tests/manual/control-file/data/d/d_1.txt +6 -0
- data/tests/manual/control-file/data/d/d_2.txt +6 -0
- data/tests/manual/control-file/data/d/d_3.txt +6 -0
- data/tests/manual/control-file/data/d/d_4.txt +6 -0
- data/tests/manual/control-file/data/d/d_5.txt +6 -0
- data/tests/manual/control-file/data/d/d_6.txt +6 -0
- data/tests/manual/control-file/data/e/e_1.txt +6 -0
- data/tests/manual/control-file/data/e/e_2.txt +6 -0
- data/tests/manual/control-file/data/e/e_3.txt +6 -0
- data/tests/manual/control-file/data/e/e_4.txt +6 -0
- data/tests/manual/control-file/data/e/e_5.txt +6 -0
- data/tests/manual/control-file/data/e/e_6.txt +6 -0
- data/tests/manual/control-file/data/f/f_1.txt +6 -0
- data/tests/manual/control-file/data/f/f_2.txt +6 -0
- data/tests/manual/control-file/data/f/f_3.txt +6 -0
- data/tests/manual/control-file/data/f/f_4.txt +6 -0
- data/tests/manual/control-file/data/f/f_5.txt +6 -0
- data/tests/manual/control-file/data/f/f_6.txt +6 -0
- data/tests/manual/control-file/run_test.sh +40 -0
- data/tests/manual/control-file/setup.sql +8 -0
- data/tests/manual/control-file/verify.sql +10 -0
- data/tests/manual/folder/data/a/a_1.txt +6 -0
- data/tests/manual/folder/data/a/a_2.txt +6 -0
- data/tests/manual/folder/data/a/a_3.txt +6 -0
- data/tests/manual/folder/data/a/a_4.txt +6 -0
- data/tests/manual/folder/data/a/a_5.txt +6 -0
- data/tests/manual/folder/data/a/a_6.txt +6 -0
- data/tests/manual/folder/run_test.sh +37 -0
- data/tests/manual/folder/setup.sql +4 -0
- data/tests/manual/folder/verify.sql +2 -0
- metadata +158 -0
@@ -0,0 +1,175 @@
|
|
1
|
+
# Copyright (c) 2012 SnowPlow Analytics Ltd. All rights reserved.
|
2
|
+
#
|
3
|
+
# This program is licensed to you under the Apache License Version 2.0,
|
4
|
+
# and you may not use this file except in compliance with the Apache License Version 2.0.
|
5
|
+
# You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0.
|
6
|
+
#
|
7
|
+
# Unless required by applicable law or agreed to in writing,
|
8
|
+
# software distributed under the Apache License Version 2.0 is distributed on an
|
9
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
10
|
+
# See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.
|
11
|
+
|
12
|
+
# Author:: Alex Dean (mailto:support@snowplowanalytics.com)
|
13
|
+
# Copyright:: Copyright (c) 2012 SnowPlow Analytics Ltd
|
14
|
+
# License:: Apache License Version 2.0
|
15
|
+
|
16
|
+
require 'thread'
|
17
|
+
|
18
|
+
require 'infobright-loader/db'
|
19
|
+
|
20
|
+
module InfobrightLoader
  # Bulk-load routines: take data files on disk and load them into
  # Infobright tables through InfobrightLoader::Db, either one table at a
  # time or several tables concurrently (one Ruby thread per table).
  #
  # Every method is exposed with module_function, so all of them -- helpers
  # included -- are callable as InfobrightLoader::Loader.<name>.
  module Loader

    # Raised when a load cannot be attempted at all (nothing to load, bad
    # process count, server or database unreachable). Subclasses
    # ArgumentError, so callers rescuing ArgumentError keep working.
    class LoadError < ArgumentError; end

    # Load a single table in Infobright with the contents of a single folder.
    #
    # folder    - directory to scan recursively; a trailing slash is optional
    # table     - name of the table to populate
    # db        - database descriptor handed through to InfobrightLoader::Db
    #             (must respond to #name)
    # separator - field separator used in the data files
    # encloser  - field encloser used in the data files
    #
    # Returns the list of failure descriptions produced by load_from_hash.
    # Raises LoadError if the folder contains no regular files.
    def load_from_folder(folder, table, db, separator='|', encloser='')

      # File.join puts exactly one separator between the folder and the
      # wildcard, so "data" and "data/" behave the same. An empty folder
      # string keeps its old meaning of "the current directory" rather than
      # being turned into the filesystem root by File.join.
      base = folder.to_s.empty? ? '.' : folder
      pattern = File.join(base, '**', '*')

      # Absolute paths of every regular file below the folder, sorted so the
      # load order does not depend on the filesystem
      files = Dir[pattern].select { |f| File.file?(f) }.map { |f| File.expand_path(f) }.sort

      # Check we have some files to load
      raise LoadError, "No files to load in folder #{folder}" if files.empty?

      # Now we have converted the folder and table into a
      # one-entry hash, we can use load_from_hash()
      load_from_hash({ table => files }, db, 1, separator, encloser)
    end
    module_function :load_from_folder

    # Load Infobright using a hash of tables to filenames.
    #
    # load_hash - { table name => array of file paths }
    # db        - database descriptor handed through to InfobrightLoader::Db
    #             (must respond to #name)
    # processes - maximum number of tables to load concurrently; reduced to
    #             the number of tables when there are fewer tables than that
    # separator - field separator used in the data files
    # encloser  - field encloser used in the data files
    #
    # Returns an array of "file (error)" strings, one per file that failed.
    # Raises LoadError if there are no tables, if processes is not a number
    # >= 1, or if the server / database checks in InfobrightLoader::Db fail.
    def load_from_hash(load_hash, db, processes=10, separator='|', encloser='')

      # Check we have some tables
      t_count = load_hash.length
      raise LoadError, "We have no tables to populate" if t_count == 0

      # Zero (or fewer) workers would start no load at all and still return
      # an empty failure list, which looks exactly like success
      unless processes.is_a?(Numeric) && processes >= 1
        raise LoadError, "Number of processes must be at least 1 (got #{processes.inspect})"
      end

      # No point in starting more workers than there are tables
      if t_count < processes
        puts "We have only #{t_count} table(s) to populate, reducing processes from #{processes} to #{t_count}" # TODO: move to Ruby logger?
        processes = t_count
      end

      # Now let's check MySQL server is accessible
      unless InfobrightLoader::Db.running?(db)
        raise LoadError, "Default MySQL server cannot be found or is not running"
      end

      # Now let's check that we can access the database
      unless InfobrightLoader::Db.db_exists?(db)
        raise LoadError, "Database #{db.name} cannot be found or user lacks sufficient privileges"
      end

      # Now we're ready to start with the load - a single table is loaded
      # directly, anything more goes through the thread pool
      if t_count == 1
        table, files = load_hash.first
        load_table(files, table, db, separator, encloser)
      else
        load_parallel(load_hash, db, processes, separator, encloser)
        # load_serial(load_hash, db, separator, encloser) # For debugging without worrying about threads.
      end
    end
    module_function :load_from_hash

    # Internal helpers follow. Because each one is exposed with
    # module_function they remain callable as Loader.<name>; a bare
    # `private` here would have no effect on that.

    # Load a single table, one file after another.
    #
    # files - array of file paths; nil or a single path is also accepted
    #
    # A LoadError raised for one file is logged and recorded, and the
    # remaining files are still attempted.
    # Returns an array of "file (error)" strings for the files that failed.
    def load_table(files, table, db, separator, encloser)

      failures = [] # Files we didn't manage to load

      # Array() turns nil into [] and a lone path into [path], so a table
      # listed with no (or just one) file does not crash the whole load
      Array(files).each do |f|
        puts "Loading file #{f} into table #{db.name}.#{table}" # TODO: move to Ruby logger?
        begin
          InfobrightLoader::Db.load_file(f, table, db, separator, encloser)
        rescue LoadError => le
          puts "LOAD ERROR: %s" % le # TODO: move to Ruby logger?
          failures << "%s (%s)" % [f, le]
        end
      end

      failures
    end
    module_function :load_table

    # Perform a serial load: every table in turn, on the calling thread.
    # Only used for debugging.
    #
    # Returns the combined failure list of all tables.
    def load_serial(load_hash, db, separator, encloser)
      load_hash.flat_map do |table, files|
        load_table(files, table, db, separator, encloser)
      end
    end
    module_function :load_serial

    # Perform a parallel load using `processes` Ruby threads.
    #
    # Each thread repeatedly takes one table off the shared work list and
    # loads all of that table's files, until the list is empty.
    #
    # Returns the combined failure list of all tables (order not defined).
    def load_parallel(load_hash, db, processes, separator, encloser)

      tables_to_load = load_hash.keys # A fresh array, so popping leaves load_hash intact
      files_not_loaded = []
      mutex = Mutex.new

      # Create Ruby threads to concurrently execute Infobright loads
      threads = (0...processes).map do
        Thread.new do

          # If an exception is thrown in this thread that isn't handled, die
          # quickly. Set on the worker only, so the process-wide default is
          # not changed for the rest of the application.
          Thread.current.abort_on_exception = true

          loop do

            # Critical section: only one thread may modify the work list at
            # any time. `table` is local to this thread's block, so another
            # thread popping its own table can never overwrite ours.
            table = mutex.synchronize { tables_to_load.pop }

            # Let's quit if we have no table to load
            break if table.nil? # Exit the thread

            # Otherwise let's run through and do all the loads for this table
            failures = load_table(load_hash[table], table, db, separator, encloser)

            # Also critical: only one thread should update the failures
            # list at a time
            mutex.synchronize { files_not_loaded.concat(failures) }
          end
        end
      end

      # Wait for threads to finish
      threads.each { |worker| worker.join }
      files_not_loaded
    end
    module_function :load_parallel

  end
end
|
@@ -0,0 +1 @@
|
|
1
|
+
control-file-out.yml
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# Control file for Infobright Ruby Loader control file test
|
2
|
+
#
|
3
|
+
# Is processed with sed to finalise <<x>> variables before
|
4
|
+
# the test is run
|
5
|
+
|
6
|
+
:load:
|
7
|
+
:processes: 3 # Test with 3 processes populating 5 tables
|
8
|
+
:database:
|
9
|
+
:name: wrong # Will override at the command line to irl_tests
|
10
|
+
:username: <<USERNAME>>
|
11
|
+
:password: <<PASSWORD>>
|
12
|
+
:data_format:
|
13
|
+
:separator: \|
|
14
|
+
:encloser: \' # Will override at the command line to "
|
15
|
+
:data_loads:
|
16
|
+
b:
|
17
|
+
- <<PATH>>/data/b/b_1.txt # Bad data
|
18
|
+
- <<PATH>>/data/b/b_2.txt
|
19
|
+
- <<PATH>>/data/b/b_3.txt
|
20
|
+
- <<PATH>>/data/b/b_4.txt
|
21
|
+
- <<PATH>>/data/b/b_5.txt
|
22
|
+
- <<PATH>>/data/b/b_6.txt
|
23
|
+
c:
|
24
|
+
- <<PATH>>/data/c/c_1.txt
|
25
|
+
- <<PATH>>/data/c/c_2.txt
|
26
|
+
- <<PATH>>/data/c/BAD_3.txt
|
27
|
+
- <<PATH>>/data/c/c_4.txt
|
28
|
+
- <<PATH>>/data/c/c_5.txt
|
29
|
+
- <<PATH>>/data/c/c_6.txt
|
30
|
+
d:
|
31
|
+
- <<PATH>>/data/d/BAD_1.txt
|
32
|
+
- <<PATH>>/data/d/d_2.txt
|
33
|
+
- <<PATH>>/data/d/d_3.txt
|
34
|
+
- <<PATH>>/data/d/d_4.txt
|
35
|
+
- <<PATH>>/data/d/d_5.txt # Bad data
|
36
|
+
- <<PATH>>/data/d/d_6.txt
|
37
|
+
e:
|
38
|
+
- <<PATH>>/data/e/e_1.txt
|
39
|
+
- <<PATH>>/data/e/e_2.txt
|
40
|
+
- <<PATH>>/data/e/e_3.txt
|
41
|
+
- <<PATH>>/data/e/BAD_4.txt
|
42
|
+
- <<PATH>>/data/e/e_5.txt
|
43
|
+
- <<PATH>>/data/e/BAD_6.txt
|
44
|
+
f:
|
45
|
+
- <<PATH>>/data/f/f_1.txt
|
46
|
+
- <<PATH>>/data/f/f_2.txt
|
47
|
+
- <<PATH>>/data/f/f_3.txt
|
48
|
+
- <<PATH>>/data/f/f_4.txt
|
49
|
+
- <<PATH>>/data/f/f_5.txt
|
50
|
+
- <<PATH>>/data/f/f_6.txt
|