infobright-loader 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/Gemfile +19 -0
- data/LICENSE-2.0.txt +202 -0
- data/README.md +232 -0
- data/Rakefile +2 -0
- data/bin/infobright-loader +45 -0
- data/control-file/template.yml +23 -0
- data/infobright-loader.gemspec +35 -0
- data/lib/infobright-loader.rb +5 -0
- data/lib/infobright-loader/cli/config.rb +204 -0
- data/lib/infobright-loader/cli/loader.rb +66 -0
- data/lib/infobright-loader/db.rb +90 -0
- data/lib/infobright-loader/loader.rb +175 -0
- data/lib/infobright-loader/version.rb +4 -0
- data/tests/manual/bad-files/.gitignore +1 -0
- data/tests/manual/bad-files/control-file-raw.yml +50 -0
- data/tests/manual/bad-files/data/b/b_1.txt +6 -0
- data/tests/manual/bad-files/data/b/b_2.txt +6 -0
- data/tests/manual/bad-files/data/b/b_3.txt +6 -0
- data/tests/manual/bad-files/data/b/b_4.txt +6 -0
- data/tests/manual/bad-files/data/b/b_5.txt +6 -0
- data/tests/manual/bad-files/data/b/b_6.txt +6 -0
- data/tests/manual/bad-files/data/c/c_1.txt +6 -0
- data/tests/manual/bad-files/data/c/c_2.txt +6 -0
- data/tests/manual/bad-files/data/c/c_3.txt +6 -0
- data/tests/manual/bad-files/data/c/c_4.txt +6 -0
- data/tests/manual/bad-files/data/c/c_5.txt +6 -0
- data/tests/manual/bad-files/data/c/c_6.txt +6 -0
- data/tests/manual/bad-files/data/d/d_1.txt +6 -0
- data/tests/manual/bad-files/data/d/d_2.txt +6 -0
- data/tests/manual/bad-files/data/d/d_3.txt +6 -0
- data/tests/manual/bad-files/data/d/d_4.txt +6 -0
- data/tests/manual/bad-files/data/d/d_5.txt +6 -0
- data/tests/manual/bad-files/data/d/d_6.txt +6 -0
- data/tests/manual/bad-files/data/e/e_1.txt +6 -0
- data/tests/manual/bad-files/data/e/e_2.txt +6 -0
- data/tests/manual/bad-files/data/e/e_3.txt +6 -0
- data/tests/manual/bad-files/data/e/e_4.txt +6 -0
- data/tests/manual/bad-files/data/e/e_5.txt +6 -0
- data/tests/manual/bad-files/data/e/e_6.txt +6 -0
- data/tests/manual/bad-files/data/f/f_1.txt +6 -0
- data/tests/manual/bad-files/data/f/f_2.txt +6 -0
- data/tests/manual/bad-files/data/f/f_3.txt +6 -0
- data/tests/manual/bad-files/data/f/f_4.txt +6 -0
- data/tests/manual/bad-files/data/f/f_5.txt +6 -0
- data/tests/manual/bad-files/data/f/f_6.txt +6 -0
- data/tests/manual/bad-files/run_test.sh +40 -0
- data/tests/manual/bad-files/setup.sql +8 -0
- data/tests/manual/bad-files/verify.sql +10 -0
- data/tests/manual/control-file/.gitignore +1 -0
- data/tests/manual/control-file/control-file-raw.yml +50 -0
- data/tests/manual/control-file/data/b/b_1.txt +6 -0
- data/tests/manual/control-file/data/b/b_2.txt +6 -0
- data/tests/manual/control-file/data/b/b_3.txt +6 -0
- data/tests/manual/control-file/data/b/b_4.txt +6 -0
- data/tests/manual/control-file/data/b/b_5.txt +6 -0
- data/tests/manual/control-file/data/b/b_6.txt +6 -0
- data/tests/manual/control-file/data/c/c_1.txt +6 -0
- data/tests/manual/control-file/data/c/c_2.txt +6 -0
- data/tests/manual/control-file/data/c/c_3.txt +6 -0
- data/tests/manual/control-file/data/c/c_4.txt +6 -0
- data/tests/manual/control-file/data/c/c_5.txt +6 -0
- data/tests/manual/control-file/data/c/c_6.txt +6 -0
- data/tests/manual/control-file/data/d/d_1.txt +6 -0
- data/tests/manual/control-file/data/d/d_2.txt +6 -0
- data/tests/manual/control-file/data/d/d_3.txt +6 -0
- data/tests/manual/control-file/data/d/d_4.txt +6 -0
- data/tests/manual/control-file/data/d/d_5.txt +6 -0
- data/tests/manual/control-file/data/d/d_6.txt +6 -0
- data/tests/manual/control-file/data/e/e_1.txt +6 -0
- data/tests/manual/control-file/data/e/e_2.txt +6 -0
- data/tests/manual/control-file/data/e/e_3.txt +6 -0
- data/tests/manual/control-file/data/e/e_4.txt +6 -0
- data/tests/manual/control-file/data/e/e_5.txt +6 -0
- data/tests/manual/control-file/data/e/e_6.txt +6 -0
- data/tests/manual/control-file/data/f/f_1.txt +6 -0
- data/tests/manual/control-file/data/f/f_2.txt +6 -0
- data/tests/manual/control-file/data/f/f_3.txt +6 -0
- data/tests/manual/control-file/data/f/f_4.txt +6 -0
- data/tests/manual/control-file/data/f/f_5.txt +6 -0
- data/tests/manual/control-file/data/f/f_6.txt +6 -0
- data/tests/manual/control-file/run_test.sh +40 -0
- data/tests/manual/control-file/setup.sql +8 -0
- data/tests/manual/control-file/verify.sql +10 -0
- data/tests/manual/folder/data/a/a_1.txt +6 -0
- data/tests/manual/folder/data/a/a_2.txt +6 -0
- data/tests/manual/folder/data/a/a_3.txt +6 -0
- data/tests/manual/folder/data/a/a_4.txt +6 -0
- data/tests/manual/folder/data/a/a_5.txt +6 -0
- data/tests/manual/folder/data/a/a_6.txt +6 -0
- data/tests/manual/folder/run_test.sh +37 -0
- data/tests/manual/folder/setup.sql +4 -0
- data/tests/manual/folder/verify.sql +2 -0
- metadata +158 -0
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
# Copyright (c) 2012 SnowPlow Analytics Ltd. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# This program is licensed to you under the Apache License Version 2.0,
|
|
4
|
+
# and you may not use this file except in compliance with the Apache License Version 2.0.
|
|
5
|
+
# You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0.
|
|
6
|
+
#
|
|
7
|
+
# Unless required by applicable law or agreed to in writing,
|
|
8
|
+
# software distributed under the Apache License Version 2.0 is distributed on an
|
|
9
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.
|
|
11
|
+
|
|
12
|
+
# Author:: Alex Dean (mailto:support@snowplowanalytics.com)
|
|
13
|
+
# Copyright:: Copyright (c) 2012 SnowPlow Analytics Ltd
|
|
14
|
+
# License:: Apache License Version 2.0
|
|
15
|
+
|
|
16
|
+
require 'thread'
|
|
17
|
+
|
|
18
|
+
require 'infobright-loader/db'
|
|
19
|
+
|
|
20
|
+
module InfobrightLoader
  # Loads delimited text files into Infobright tables, either one table at a
  # time or several tables concurrently (one worker thread per table).
  #
  # Every method is exported with module_function, so they are called as
  # InfobrightLoader::Loader.load_from_folder(...) etc.
  module Loader

    # Raised when a load cannot be started (nothing to load, server or
    # database not reachable).
    # NOTE: inside this module the bare constant LoadError resolves to this
    # class, not to Ruby's built-in ::LoadError.
    class LoadError < ArgumentError; end

    # Load a single table in Infobright with the contents of a single
    # folder (searched recursively).
    #
    # folder    - directory to search; a trailing slash is optional
    # table     - name of the table to populate
    # db        - database descriptor handed through to InfobrightLoader::Db
    # separator - field separator passed to the load
    # encloser  - field encloser passed to the load
    #
    # Returns the list of failures reported by load_from_hash.
    # Raises LoadError if the folder contains no files.
    def load_from_folder(folder, table, db, separator='|', encloser='')

      # Build the glob with File.join so that the recursive '**' is always its
      # own path component. Plain string concatenation ("#{folder}**/*") only
      # recursed when the caller remembered a trailing slash, and otherwise
      # also matched sibling directories that merely share the prefix.
      # An empty folder keeps meaning "the current directory".
      root = folder.to_s.empty? ? '.' : folder
      candidates = Dir[File.join(root, '**', '*')]

      # Keep regular files only, as absolute paths, in a stable order
      files = candidates.select { |f| File.file?(f) }.map { |f| File.expand_path(f) }.sort

      # Check we have some files to load
      if files.empty?
        raise LoadError, "No files to load in folder #{folder}"
      end

      # Now we have converted the folder and table
      # into a hash, we can use load_from_hash()
      load_from_hash({ table => files }, db, 1, separator, encloser)
    end
    module_function :load_from_folder

    # Load Infobright using a hash of tables to filenames.
    #
    # load_hash - hash mapping each table name to the files to load into it
    # db        - database descriptor handed through to InfobrightLoader::Db
    # processes - maximum number of tables to load concurrently; it is
    #             reduced to the number of tables when that is smaller
    # separator - field separator passed to the load
    # encloser  - field encloser passed to the load
    #
    # Returns an array of strings, one per file that failed to load.
    # Raises LoadError if there are no tables, or if the server or the
    # database cannot be reached.
    def load_from_hash(load_hash, db, processes=10, separator='|', encloser='')

      t_count = load_hash.length

      # Some validation about the load we're going to do
      raise LoadError, "We have no tables to populate" if t_count == 0

      if t_count < processes
        puts "We have only #{t_count} table(s) to populate, reducing processes from #{processes} to #{t_count}" # TODO: move to Ruby logger?
        processes = t_count
      end

      # Now let's check MySQL server is accessible
      unless InfobrightLoader::Db.running?(db)
        raise LoadError, "Default MySQL server cannot be found or is not running"
      end

      # Now let's check that we can access the database
      unless InfobrightLoader::Db.db_exists?(db)
        raise LoadError, "Database #{db.name} cannot be found or user lacks sufficient privileges"
      end

      # Now we're ready to start with the load. A single table needs no
      # threads; anything more goes through the parallel loader.
      if t_count == 1
        table, files = load_hash.first
        load_table(files, table, db, separator, encloser)
      else
        load_parallel(load_hash, db, processes, separator, encloser)
        # load_serial(load_hash, db, separator, encloser) # For debugging without worrying about threads.
      end
    end
    module_function :load_from_hash

    private

    # Load a single table, one file after another.
    #
    # A file whose load raises this module's LoadError is reported and
    # recorded, and the remaining files are still attempted.
    #
    # Returns an array of "file (error)" strings for the files which
    # could not be loaded.
    def load_table(files, table, db, separator, encloser)

      failures = [] # Files we didn't manage to load

      # Array() tolerates a table that was given no file list (nil)
      # or a single path instead of a list
      Array(files).each do |f|
        puts "Loading file #{f} into table #{db.name}.#{table}" # TODO: move to Ruby logger?
        begin
          InfobrightLoader::Db.load_file(f, table, db, separator, encloser)
        rescue LoadError => le
          puts "LOAD ERROR: #{le}" # TODO: move to Ruby logger?
          failures << "#{f} (#{le})"
        end
      end

      failures
    end
    module_function :load_table

    # Perform a serial load, table by table.
    # Only used for debugging.
    #
    # Returns the combined failures of every table.
    def load_serial(load_hash, db, separator, encloser)

      files_not_loaded = []

      load_hash.each do |table, files|
        files_not_loaded.concat(load_table(files, table, db, separator, encloser))
      end

      files_not_loaded
    end
    module_function :load_serial

    # Perform a parallel load using `processes` worker threads.
    #
    # Each worker repeatedly takes one table off a shared work list and
    # loads all of that table's files, until the list is empty.
    #
    # Returns the combined failures of every table.
    def load_parallel(load_hash, db, processes, separator, encloser)

      # Shared work list of [table, files] pairs, guarded by the mutex
      jobs = load_hash.to_a
      files_not_loaded = []
      mutex = Mutex.new

      # If an exception is thrown in a thread that isn't handled, die quickly.
      # NOTE: this is a process-wide setting, not one scoped to these threads.
      Thread.abort_on_exception = true

      # Create Ruby threads to concurrently execute Infobright loads
      threads = (0...processes).map do
        Thread.new do
          loop do

            # Critical section: only one thread may modify the work list at
            # any time. The popped job is local to this thread; sharing the
            # current table between threads would let one worker overwrite
            # another's table before it has been loaded.
            job = mutex.synchronize { jobs.pop }

            # Let's quit if we have no table to load
            break if job.nil?

            # Otherwise let's run through and do all the loads for this table
            table, files = job
            failures = load_table(files, table, db, separator, encloser)

            # Also critical: only one thread should update the failures
            # list at a time
            mutex.synchronize { files_not_loaded.concat(failures) }
          end
        end
      end

      # Wait for threads to finish
      threads.each { |worker| worker.join }
      files_not_loaded
    end
    module_function :load_parallel

  end
end
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
control-file-out.yml
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# Control file for the Infobright Ruby Loader bad-files test
|
|
2
|
+
#
|
|
3
|
+
# Is processed with sed to finalise <<x>> variables before
|
|
4
|
+
# the test is run
|
|
5
|
+
|
|
6
|
+
:load:
|
|
7
|
+
:processes: 3 # Test with 3 processes populating 5 tables
|
|
8
|
+
:database:
|
|
9
|
+
:name: wrong # Will override at the command line to irl_tests
|
|
10
|
+
:username: <<USERNAME>>
|
|
11
|
+
:password: <<PASSWORD>>
|
|
12
|
+
:data_format:
|
|
13
|
+
:separator: \|
|
|
14
|
+
:encloser: \' # Will override at the command line to "
|
|
15
|
+
:data_loads:
|
|
16
|
+
b:
|
|
17
|
+
- <<PATH>>/data/b/b_1.txt # Bad data
|
|
18
|
+
- <<PATH>>/data/b/b_2.txt
|
|
19
|
+
- <<PATH>>/data/b/b_3.txt
|
|
20
|
+
- <<PATH>>/data/b/b_4.txt
|
|
21
|
+
- <<PATH>>/data/b/b_5.txt
|
|
22
|
+
- <<PATH>>/data/b/b_6.txt
|
|
23
|
+
c:
|
|
24
|
+
- <<PATH>>/data/c/c_1.txt
|
|
25
|
+
- <<PATH>>/data/c/c_2.txt
|
|
26
|
+
- <<PATH>>/data/c/BAD_3.txt
|
|
27
|
+
- <<PATH>>/data/c/c_4.txt
|
|
28
|
+
- <<PATH>>/data/c/c_5.txt
|
|
29
|
+
- <<PATH>>/data/c/c_6.txt
|
|
30
|
+
d:
|
|
31
|
+
- <<PATH>>/data/d/BAD_1.txt
|
|
32
|
+
- <<PATH>>/data/d/d_2.txt
|
|
33
|
+
- <<PATH>>/data/d/d_3.txt
|
|
34
|
+
- <<PATH>>/data/d/d_4.txt
|
|
35
|
+
- <<PATH>>/data/d/d_5.txt # Bad data
|
|
36
|
+
- <<PATH>>/data/d/d_6.txt
|
|
37
|
+
e:
|
|
38
|
+
- <<PATH>>/data/e/e_1.txt
|
|
39
|
+
- <<PATH>>/data/e/e_2.txt
|
|
40
|
+
- <<PATH>>/data/e/e_3.txt
|
|
41
|
+
- <<PATH>>/data/e/BAD_4.txt
|
|
42
|
+
- <<PATH>>/data/e/e_5.txt
|
|
43
|
+
- <<PATH>>/data/e/BAD_6.txt
|
|
44
|
+
f:
|
|
45
|
+
- <<PATH>>/data/f/f_1.txt
|
|
46
|
+
- <<PATH>>/data/f/f_2.txt
|
|
47
|
+
- <<PATH>>/data/f/f_3.txt
|
|
48
|
+
- <<PATH>>/data/f/f_4.txt
|
|
49
|
+
- <<PATH>>/data/f/f_5.txt
|
|
50
|
+
- <<PATH>>/data/f/f_6.txt
|