skydb 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. data/bin/sky +4 -0
  2. data/lib/skydb.rb +3 -2
  3. data/lib/skydb/action.rb +19 -0
  4. data/lib/skydb/client.rb +15 -5
  5. data/lib/skydb/event.rb +3 -7
  6. data/lib/skydb/import/importer.rb +236 -59
  7. data/lib/skydb/import/transforms/apache.yml +4 -0
  8. data/lib/skydb/import/transforms/sky.yml +20 -12
  9. data/lib/skydb/message.rb +1 -0
  10. data/lib/skydb/message/add_event.rb +1 -1
  11. data/lib/skydb/message/get_actions.rb +4 -0
  12. data/lib/skydb/message/get_properties.rb +4 -0
  13. data/lib/skydb/message/get_tables.rb +43 -0
  14. data/lib/skydb/message/lua/aggregate.rb +4 -0
  15. data/lib/skydb/property.rb +10 -0
  16. data/lib/skydb/query.rb +44 -59
  17. data/lib/skydb/query/after_condition.rb +104 -0
  18. data/lib/skydb/query/{after.rb → condition.rb} +37 -27
  19. data/lib/skydb/query/on_condition.rb +53 -0
  20. data/lib/skydb/query/selection.rb +131 -1
  21. data/lib/skydb/query/selection_field.rb +25 -0
  22. data/lib/skydb/query/selection_group.rb +21 -0
  23. data/lib/skydb/table.rb +7 -0
  24. data/lib/skydb/version.rb +1 -1
  25. data/test/integration/query_test.rb +102 -0
  26. data/test/test_helper.rb +42 -1
  27. data/test/{client_test.rb → unit/client_test.rb} +0 -0
  28. data/test/{event_test.rb → unit/event_test.rb} +0 -5
  29. data/test/unit/import/importer_test.rb +208 -0
  30. data/test/{import → unit/import}/translator_test.rb +0 -0
  31. data/test/{message → unit/message}/add_action_message_test.rb +0 -0
  32. data/test/{message → unit/message}/add_event_message_test.rb +2 -2
  33. data/test/{message → unit/message}/add_property_message_test.rb +0 -0
  34. data/test/{message → unit/message}/create_table_message_test.rb +0 -0
  35. data/test/{message → unit/message}/delete_table_message_test.rb +0 -0
  36. data/test/{message → unit/message}/get_action_message_test.rb +0 -0
  37. data/test/{message → unit/message}/get_actions_message_test.rb +0 -0
  38. data/test/{message → unit/message}/get_properties_message_test.rb +0 -0
  39. data/test/{message → unit/message}/get_property_message_test.rb +0 -0
  40. data/test/{message → unit/message}/get_table_message_test.rb +0 -0
  41. data/test/unit/message/get_tables_message_test.rb +18 -0
  42. data/test/{message → unit/message}/lookup_message_test.rb +0 -0
  43. data/test/{message → unit/message}/lua_aggregate_message_test.rb +0 -0
  44. data/test/{message → unit/message}/multi_message_test.rb +0 -0
  45. data/test/{message → unit/message}/next_action_message_test.rb +0 -0
  46. data/test/{message → unit/message}/ping_message_test.rb +0 -0
  47. data/test/{message_test.rb → unit/message_test.rb} +0 -0
  48. data/test/unit/query/after_test.rb +89 -0
  49. data/test/{query/after_test.rb → unit/query/on_test.rb} +10 -10
  50. data/test/{query → unit/query}/selection_test.rb +2 -2
  51. data/test/{query_test.rb → unit/query_test.rb} +32 -6
  52. data/test/{skydb_test.rb → unit/skydb_test.rb} +0 -0
  53. metadata +165 -53
  54. data/test/import/importer_test.rb +0 -42
data/bin/sky CHANGED
@@ -21,8 +21,10 @@ SkyDB.debug = true
21
21
  command :import do |c|
22
22
  c.syntax = 'sky import FILE'
23
23
  c.description = 'Imports data from a text file into a Sky table.'
24
+ c.option('--processes NUM', 'The number of processes to use.')
24
25
  c.option('--table STRING', 'The name of the table to import to.')
25
26
  c.option('--format STRING', 'The YAML format file to import with.')
27
+ c.option('--file-type STRING', 'The type of file being imported (tsv,json,csv,apache_log).')
26
28
  c.option('--headers STRING', 'A comma-delimited list of headers to use.')
27
29
  c.option('--append', 'Appends to an existing database if one exists.')
28
30
  c.option('--overwrite', 'Overwrites an existing database if one exists.')
@@ -39,6 +41,8 @@ command :import do |c|
39
41
  importer = SkyDB::Import::Importer.new()
40
42
  importer.table_name = options.table || ask("Table: ")
41
43
  importer.headers = options.headers.nil? ? nil : options.headers.split(/,/)
44
+ importer.file_type = options.file_type.nil? ? nil : options.file_type.to_sym
45
+ importer.processes = options.processes.nil? ? 1 : options.processes.to_i
42
46
 
43
47
  # Load transform files by name.
44
48
  formats = options.format || ask("Format: ")
data/lib/skydb.rb CHANGED
@@ -2,6 +2,7 @@ require 'date'
2
2
  require 'msgpack'
3
3
  require 'socket'
4
4
  require 'treetop'
5
+ require 'json'
5
6
 
6
7
  require 'skydb/action'
7
8
  require 'skydb/client'
@@ -40,12 +41,12 @@ class SkyDB
40
41
  :table_name, :table_name=,
41
42
  :multi, :ping, :lookup,
42
43
  :add_event,
43
- :create_table, :delete_table, :get_table,
44
+ :create_table, :delete_table, :get_table, :get_tables,
44
45
  :add_action, :get_action, :get_actions,
45
46
  :add_property, :get_property, :get_properties,
46
47
  :next_actions,
47
48
  :aggregate,
48
- :select
49
+ :query, :select
49
50
  ]
50
51
 
51
52
 
data/lib/skydb/action.rb CHANGED
@@ -53,5 +53,24 @@ class SkyDB
53
53
  def to_msgpack
54
54
  return {id:id, name:name}.to_msgpack
55
55
  end
56
+
57
# Serializes the action into a JSON string by way of its hash form.
#
# @return [String] the JSON encoding of the hash returned by #to_hash.
def to_json(*args)
  to_hash.to_json(*args)
end
59
+
60
# Converts the action into a hash with string keys, ready for JSON
# encoding. Entries whose value is an empty string or zero are left out.
#
# @return [Hash] the 'id' and 'name' entries that carry a value.
def to_hash(*_args)
  attributes = { 'id' => id, 'name' => name }
  attributes.reject { |_key, value| value == '' || value == 0 }
end
67
+
68
# Populates the action from a hash.
#
# @param [Hash] hash the source values, read from the 'id' and 'name' keys.
#   The id is coerced with to_i.
# @return [Object, nil] the receiver itself, or nil when hash is nil.
def from_hash(hash, *_args)
  unless hash.nil?
    self.id = hash['id'].to_i
    self.name = hash['name']
    self
  end
end
56
75
  end
57
76
  end
data/lib/skydb/client.rb CHANGED
@@ -71,9 +71,14 @@ class SkyDB
71
71
  # Retrieves an individual table from the server, if it exists. Otherwise
72
72
  # returns nil.
73
73
  #
74
# @param [String] name the name of the table to retrieve.
# @param [Hash] options passed through to the GetTable message.
def get_table(name, options={})
  message = SkyDB::Message::GetTable.new(name, options)
  send_message(message)
end
78
+
79
# Retrieves a list of all tables on the server.
#
# @param [Hash] options passed through to the GetTables message.
def get_tables(options={})
  message = SkyDB::Message::GetTables.new(options)
  send_message(message)
end
78
83
 
79
84
 
@@ -190,10 +195,15 @@ class SkyDB
190
195
  ####################################
191
196
 
192
197
  # Starts a query against the database.
198
+ def query()
199
+ return SkyDB::Query.new(:client => self)
200
+ end
201
+
202
# Starts a query with a single selection against the database.
#
# @param [String] fields a list of properties to select from the database.
# @return the result of calling select on a fresh query.
def select(fields)
  new_query = query
  new_query.select(fields)
end
198
208
 
199
209
 
data/lib/skydb/event.rb CHANGED
@@ -25,12 +25,8 @@ class SkyDB
25
25
  # Object ID
26
26
  ##################################
27
27
 
28
- # The numeric identifier of the object that the event is attached to.
29
- attr_reader :object_id
30
-
31
- def object_id=(value)
32
- @object_id = value.to_i
33
- end
28
+ # The object identifier.
29
+ attr_accessor :object_id
34
30
 
35
31
  ##################################
36
32
  # Timestamp
@@ -103,7 +99,7 @@ class SkyDB
103
99
  # Encodes the event into MsgPack format.
104
100
  def to_msgpack
105
101
  obj = {
106
- :objectId => object_id,
102
+ :objectId => object_id.to_msgpack,
107
103
  :timestamp => SkyDB::Timestamp.to_timestamp(timestamp)
108
104
  }
109
105
  obj[:action] = action unless action.nil? || action.empty?
@@ -1,6 +1,12 @@
1
1
  require 'yaml'
2
2
  require 'csv'
3
+ require 'yajl'
4
+ require 'zlib'
5
+ require 'bzip2'
6
+ require 'open-uri'
3
7
  require 'ruby-progressbar'
8
+ require 'apachelogregex'
9
+ require 'useragent'
4
10
 
5
11
  class SkyDB
6
12
  class Import
@@ -11,6 +17,7 @@ class SkyDB
11
17
  #
12
18
  ##########################################################################
13
19
 
20
+ class UnsupportedFileType < StandardError; end
14
21
  class TransformNotFound < StandardError; end
15
22
 
16
23
 
@@ -28,6 +35,7 @@ class SkyDB
28
35
  self.table_name = options[:table_name]
29
36
  self.format = options[:format]
30
37
  self.files = options[:files] || []
38
+ self.processes = options[:processes] || 1
31
39
  end
32
40
 
33
41
 
@@ -37,6 +45,9 @@ class SkyDB
37
45
  #
38
46
  ##########################################################################
39
47
 
48
+ # The number of processes to use.
49
+ attr_accessor :processes
50
+
40
51
  # The client to access the Sky server with.
41
52
  attr_accessor :client
42
53
 
@@ -56,6 +67,10 @@ class SkyDB
56
67
  # treat the CSV input as not having a header row.
57
68
  attr_accessor :headers
58
69
 
70
+ # The file type of file being imported can be one of
71
+ # :csv, :tsv, :json, :apache_log
72
+ attr_accessor :file_type
73
+
59
74
 
60
75
  ##########################################################################
61
76
  #
@@ -67,83 +82,244 @@ class SkyDB
67
82
  # Import
68
83
  ##################################
69
84
 
70
# Imports records from a list of files. The files are divided into groups
# and each group is handled by its own forked worker, which sends its
# events to Sky inside a SkyDB.multi batch.
#
# @param [Array, String] files a path or glob pattern, or a list of them.
# @param [Hash] options :progress_bar (default true) shows a progress bar
#   when a single process is used and no input file is compressed. The
#   hash is also passed on to #each_record.
# @return [nil]
def import(files, options={})
  files = [files] unless files.is_a?(Array)
  options[:progress_bar] = true unless options.has_key?(:progress_bar)
  progress_bar = nil

  # Set the table to import into.
  SkyDB.table_name = table_name

  # Expand the glob patterns into concrete files, leaving directories out.
  files_expanded = files.flat_map do |pattern|
    Dir[File.expand_path(pattern)].reject { |path| File.directory?(path) }
  end

  # Disable the progress bar if any input is compressed, because its line
  # count cannot be read off the file directly.
  if files_expanded.any? { |path| ['.gz', '.bz2'].include?(File.extname(path).downcase) }
    options[:progress_bar] = false
  end

  # Initialize progress bar.
  if options[:progress_bar] && processes == 1
    # Lines are counted in Ruby rather than by interpolating paths into a
    # `wc -l` shell command, which misbehaved on names containing spaces or
    # shell metacharacters.
    count = files_expanded.inject(0) { |total, path| total + File.foreach(path).count }
    progress_bar = ::ProgressBar.create(:total => count, :format => '|%B| %P%%')
  end

  # Split the files into at most `processes` groups. The slice size is
  # clamped to 1 because each_slice(0) raises when no file matched.
  file_groups =
    if processes > 1
      files_per_group = [(files_expanded.size / Float(processes)).ceil, 1].max
      files_expanded.each_slice(files_per_group).to_a
    else
      [files_expanded]
    end

  # Fork one worker per group. Iterating over the groups themselves (rather
  # than over 0...processes) avoids indexing a missing group when there are
  # fewer files than processes.
  process_ids = file_groups.map do |group|
    fork do
      SkyDB.multi(:max_count => 1000) do
        group.each do |file|
          each_record(file, options) do |input|
            # Convert input line to a symbolized hash.
            output = translate(input)
            output._symbolize_keys!

            problem =
              if output[:object_id].nil?
                "Object id required"
              elsif output[:timestamp].nil?
                "Invalid timestamp"
              end

            if problem
              progress_bar.clear unless progress_bar.nil?
              $stderr.puts "[ERROR] #{problem} on line #{$.}"
            else
              # Convert hash to an event and send to Sky.
              SkyDB.add_event(SkyDB::Event.new(output))
            end

            # Update progress bar.
            progress_bar.increment unless progress_bar.nil?
          end
        end
      end
    end
  end
  process_ids.each { |process_id| Process.waitpid(process_id) }

  # Finish progress bar.
  progress_bar.finish unless progress_bar.nil? || progress_bar.finished?

  return nil
end
156
+
140
157
 
141
- # Finish progress bar.
142
- progress_bar.finish()
158
+ ##################################
159
+ # File Iteration
160
+ ##################################
161
+
162
# Yields each line of a file, decompressing it on the fly when the
# extension (compared case-insensitively) is .bz2 or .gz.
#
# @param [String] file the path to the file to read.
# @yield [String] each line of the file.
def file_foreach(file, &block)
  case File.extname(file).downcase
  when '.bz2'
    Bzip2::Reader.foreach(file, &block)
  when '.gz'
    # each_line takes a line separator, not a path. Passing the file name
    # there made the reader split on the file name instead of on newlines.
    Zlib::GzipReader.open(file) { |gz| gz.each_line(&block) }
  else
    File.foreach(file, &block)
  end
end
180
+
181
+
182
+ ##################################
183
+ # Iteration
184
+ ##################################
185
+
186
# Executes a block for each record in a given file. How a record is read
# depends on the file type (:csv, :tsv, :json or :apache_log).
#
# When no file type has been set it is guessed from the file extension and
# stored in #file_type, so the guess also applies to later calls.
#
# @param [String] file the path to the file to iterate over.
# @param [Hash] options passed through to the type-specific reader.
# @yield each record produced by the type-specific reader.
# @return [nil]
# @raise [SkyDB::Import::Importer::UnsupportedFileType] if the file type is
#   neither set nor recognizable from the extension.
def each_record(file, options, &block)
  # Determine file type automatically if not passed in.
  if file_type.nil?
    self.file_type =
      case File.extname(file)
      when '.tsv', '.txt' then :tsv
      when '.json' then :json
      when '.csv' then :csv
      when '.log' then :apache_log
      end
    warn("[import] Determining file type: #{file_type || '???'}")
  end

  # Process the record by file type. The caller's block is forwarded
  # explicitly; capturing it with a bare `Proc.new` raises ArgumentError on
  # Ruby 3.
  case file_type
  when :csv then each_text_record(file, ",", options, &block)
  when :tsv then each_text_record(file, "\t", options, &block)
  when :json then each_json_record(file, options, &block)
  when :apache_log then each_apache_log_record(file, options, &block)
  else
    raise SkyDB::Import::Importer::UnsupportedFileType,
          "File type not supported by importer: #{file_type || File.extname(file)}"
  end

  return nil
end
215
+
216
# Walks a delimited flat file (CSV, TSV) and yields every non-blank row as
# a hash keyed by column header.
#
# @param [String] file the path to the file to iterate over.
# @param [String] col_sep the column separator.
# @param [Hash] options not used by this reader.
# @yield [Hash] the row, keyed by header.
def each_text_record(file, col_sep, options)
  # When no headers were configured, the first row of the file supplies them.
  CSV.foreach(file, :headers => headers.nil?, :col_sep => col_sep) do |row|
    record =
      if headers.nil?
        row.to_hash
      else
        # Pair the configured headers with the row's cells by position.
        headers.each_with_index.inject({}) do |fields, (header, index)|
          fields[header] = row[index]
          fields
        end
      end

    # Skip over blank rows.
    next if record.values.all? { |value| value == '' || value.nil? }

    yield(record)
  end
end
246
+
247
# Executes a block for each record that Yajl parses out of a JSON file.
#
# @param [String] file the path to the file to iterate over.
# @param [Hash] options not used by this reader.
# @yield each parsed record.
def each_json_record(file, options)
  io = open(file)

  begin
    # Process each record of the JSON file.
    Yajl::Parser.parse(io) { |record| yield(record) }
  ensure
    # The handle used to be left open; release it even if the parser or
    # the caller's block raises.
    io.close unless io.closed?
  end
end
258
+
259
# Executes a block for each request found in a standard Apache log file.
# Lines the log parser rejects are reported on stderr and skipped; HEAD and
# OPTIONS requests are skipped silently.
#
# @param [String] file the path to the file to iterate over.
# @param [Hash] options :format overrides the log format string handed to
#   ApacheLogRegex.
# @yield [Hash] a record of symbol-keyed fields extracted from the line.
def each_apache_log_record(file, options)
  format = options[:format] || '%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
  parser = ApacheLogRegex.new(format)

  file_foreach(file) do |line|
    begin
      hash = parser.parse!(line)
      _, method, url = *hash['%r'].to_s.match(/^(\w+) ([^ ]+)/)

      # Skip junk log entries before doing any further parsing.
      next if method == "HEAD" || method == "OPTIONS"

      # The request target is parsed against a dummy host so that its
      # path, query and fragment can be picked apart.
      uri =
        begin
          URI.parse("http://localhost#{url}")
        rescue URI::InvalidURIError
          nil
        end

      record = {
        :ip_address => hash['%h'],
        :timestamp => DateTime.strptime(hash['%t'].gsub(/\[|\]/, ''), "%d/%b/%Y:%H:%M:%S %z"),
        :method => method,
        :url => url,
        # The default format logs the final status as %>s; %s is kept as a
        # fallback for custom formats that use it.
        :status_code => hash['%>s'] || hash['%s'],
        :size => hash['%b'],
      }
      record[:user_identifier] = hash['%l'] unless hash['%l'] == '-'
      record[:user_id] = hash['%u'] unless hash['%u'] == '-'

      # Extract the parts of the URI.
      unless uri.nil?
        record[:path] = uri.path
        record[:query_string] = uri.query
        # The rescue covers requests without a query string.
        record[:query] = CGI::parse(uri.query) rescue {}
        record[:fragment] = uri.fragment
      end

      # Extract the referrer if there is one.
      referer = hash['%{Referer}i']
      if !referer.nil? && referer != '-'
        record[:referer] = referer
        referer_uri =
          begin
            URI.parse(referer)
          rescue URI::InvalidURIError
            nil
          end
        unless referer_uri.nil?
          record[:referer_host] = referer_uri.host
          record[:referer_path] = referer_uri.path
          record[:referer_query_string] = referer_uri.query
          record[:referer_query] = CGI::parse(referer_uri.query) rescue {}
        end
      end

      # Extract specific user agent information.
      raw_user_agent = hash['%{User-Agent}i']
      unless raw_user_agent.nil?
        user_agent = UserAgent.parse(raw_user_agent)
        record[:user_agent] = raw_user_agent
        record[:ua_name] = user_agent.browser.to_s unless user_agent.browser.nil?
        record[:ua_version] = user_agent.version.to_s unless user_agent.version.nil?
        record[:ua_platform] = user_agent.platform.to_s unless user_agent.platform.nil?
        record[:ua_os] = user_agent.os.to_s unless user_agent.os.nil?
        record[:ua_mobile] = user_agent.mobile?
      end

      yield(record)

    rescue ApacheLogRegex::ParseError => e
      $stderr.puts "[ERROR] Unable to parse line #{$.} in #{file} (#{e.message})"
    end
  end
end
147
323
 
148
324
 
149
325
  ##################################
@@ -156,12 +332,14 @@ class SkyDB
156
332
  #
157
333
  # @return [Hash] the output hash.
158
334
  def translate(input)
159
- output = {}
335
+ output = {:action => {}, :data => {}}
160
336
 
161
337
  translators.each do |translator|
162
338
  translator.translate(input, output)
163
339
  end
164
340
 
341
+ output.delete(:action) if output[:action].keys.length == 0
342
+ output.delete(:data) if output[:data].keys.length == 0
165
343
  return output
166
344
  end
167
345
 
@@ -202,7 +380,6 @@ class SkyDB
202
380
  # @param [Hash] the hash of transform info.
203
381
  # @param [Array] the path of fields.
204
382
  def load_transform_fields(fields, path=nil)
205
-
206
383
  # Convert each field to a translator.
207
384
  fields.each_pair do |key, value|
208
385
  translator = Translator.new(:output_field => (path.nil? ? key : path.clone.concat([key])))