skydb 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. data/bin/sky +4 -0
  2. data/lib/skydb.rb +3 -2
  3. data/lib/skydb/action.rb +19 -0
  4. data/lib/skydb/client.rb +15 -5
  5. data/lib/skydb/event.rb +3 -7
  6. data/lib/skydb/import/importer.rb +236 -59
  7. data/lib/skydb/import/transforms/apache.yml +4 -0
  8. data/lib/skydb/import/transforms/sky.yml +20 -12
  9. data/lib/skydb/message.rb +1 -0
  10. data/lib/skydb/message/add_event.rb +1 -1
  11. data/lib/skydb/message/get_actions.rb +4 -0
  12. data/lib/skydb/message/get_properties.rb +4 -0
  13. data/lib/skydb/message/get_tables.rb +43 -0
  14. data/lib/skydb/message/lua/aggregate.rb +4 -0
  15. data/lib/skydb/property.rb +10 -0
  16. data/lib/skydb/query.rb +44 -59
  17. data/lib/skydb/query/after_condition.rb +104 -0
  18. data/lib/skydb/query/{after.rb → condition.rb} +37 -27
  19. data/lib/skydb/query/on_condition.rb +53 -0
  20. data/lib/skydb/query/selection.rb +131 -1
  21. data/lib/skydb/query/selection_field.rb +25 -0
  22. data/lib/skydb/query/selection_group.rb +21 -0
  23. data/lib/skydb/table.rb +7 -0
  24. data/lib/skydb/version.rb +1 -1
  25. data/test/integration/query_test.rb +102 -0
  26. data/test/test_helper.rb +42 -1
  27. data/test/{client_test.rb → unit/client_test.rb} +0 -0
  28. data/test/{event_test.rb → unit/event_test.rb} +0 -5
  29. data/test/unit/import/importer_test.rb +208 -0
  30. data/test/{import → unit/import}/translator_test.rb +0 -0
  31. data/test/{message → unit/message}/add_action_message_test.rb +0 -0
  32. data/test/{message → unit/message}/add_event_message_test.rb +2 -2
  33. data/test/{message → unit/message}/add_property_message_test.rb +0 -0
  34. data/test/{message → unit/message}/create_table_message_test.rb +0 -0
  35. data/test/{message → unit/message}/delete_table_message_test.rb +0 -0
  36. data/test/{message → unit/message}/get_action_message_test.rb +0 -0
  37. data/test/{message → unit/message}/get_actions_message_test.rb +0 -0
  38. data/test/{message → unit/message}/get_properties_message_test.rb +0 -0
  39. data/test/{message → unit/message}/get_property_message_test.rb +0 -0
  40. data/test/{message → unit/message}/get_table_message_test.rb +0 -0
  41. data/test/unit/message/get_tables_message_test.rb +18 -0
  42. data/test/{message → unit/message}/lookup_message_test.rb +0 -0
  43. data/test/{message → unit/message}/lua_aggregate_message_test.rb +0 -0
  44. data/test/{message → unit/message}/multi_message_test.rb +0 -0
  45. data/test/{message → unit/message}/next_action_message_test.rb +0 -0
  46. data/test/{message → unit/message}/ping_message_test.rb +0 -0
  47. data/test/{message_test.rb → unit/message_test.rb} +0 -0
  48. data/test/unit/query/after_test.rb +89 -0
  49. data/test/{query/after_test.rb → unit/query/on_test.rb} +10 -10
  50. data/test/{query → unit/query}/selection_test.rb +2 -2
  51. data/test/{query_test.rb → unit/query_test.rb} +32 -6
  52. data/test/{skydb_test.rb → unit/skydb_test.rb} +0 -0
  53. metadata +165 -53
  54. data/test/import/importer_test.rb +0 -42
data/bin/sky CHANGED
@@ -21,8 +21,10 @@ SkyDB.debug = true
21
21
  command :import do |c|
22
22
  c.syntax = 'sky import FILE'
23
23
  c.description = 'Imports data from a text file into a Sky table.'
24
+ c.option('--processes NUM', 'The number of processes to use.')
24
25
  c.option('--table STRING', 'The name of the table to import to.')
25
26
  c.option('--format STRING', 'The YAML format file to import with.')
27
+ c.option('--file-type STRING', 'The type of file being imported (tsv,json,csv,apache_log).')
26
28
  c.option('--headers STRING', 'A comma-delimited list of headers to use.')
27
29
  c.option('--append', 'Appends to an existing database if one exists.')
28
30
  c.option('--overwrite', 'Overwrites an existing database if one exists.')
@@ -39,6 +41,8 @@ command :import do |c|
39
41
  importer = SkyDB::Import::Importer.new()
40
42
  importer.table_name = options.table || ask("Table: ")
41
43
  importer.headers = options.headers.nil? ? nil : options.headers.split(/,/)
44
+ importer.file_type = options.file_type.nil? ? nil : options.file_type.to_sym
45
+ importer.processes = options.processes.nil? ? 1 : options.processes.to_i
42
46
 
43
47
  # Load transform files by name.
44
48
  formats = options.format || ask("Format: ")
data/lib/skydb.rb CHANGED
@@ -2,6 +2,7 @@ require 'date'
2
2
  require 'msgpack'
3
3
  require 'socket'
4
4
  require 'treetop'
5
+ require 'json'
5
6
 
6
7
  require 'skydb/action'
7
8
  require 'skydb/client'
@@ -40,12 +41,12 @@ class SkyDB
40
41
  :table_name, :table_name=,
41
42
  :multi, :ping, :lookup,
42
43
  :add_event,
43
- :create_table, :delete_table, :get_table,
44
+ :create_table, :delete_table, :get_table, :get_tables,
44
45
  :add_action, :get_action, :get_actions,
45
46
  :add_property, :get_property, :get_properties,
46
47
  :next_actions,
47
48
  :aggregate,
48
- :select
49
+ :query, :select
49
50
  ]
50
51
 
51
52
 
data/lib/skydb/action.rb CHANGED
@@ -53,5 +53,24 @@ class SkyDB
53
53
  def to_msgpack
54
54
  return {id:id, name:name}.to_msgpack
55
55
  end
56
+
57
+ # Serializes the action into a JSON string.
58
# Serializes the action into a JSON string by converting it to a hash first.
#
# @param a extra arguments, forwarded unchanged to Hash#to_json.
# @return [String] the JSON text.
def to_json(*a)
  to_hash.to_json(*a)
end
59
+
60
+ # Converts the action into a hash suitable for JSON encoding.
61
# Converts the action into a hash with string keys.
#
# Entries whose value is an empty string or zero are left out.
#
# @param a ignored; accepted so the method can be called like to_json.
# @return [Hash] the 'id' and 'name' entries that carry a value.
def to_hash(*a)
  attributes = { 'id' => id, 'name' => name }
  attributes.reject! { |_key, value| value == '' || value == 0 }
  attributes
end
67
+
68
+ # Populates the action from a hash.
69
# Populates the action from a hash with 'id' and 'name' entries.
#
# @param [Hash] hash the attributes to read; may be nil.
# @param a ignored.
# @return the action itself, or nil when no hash was given.
def from_hash(hash, *a)
  if hash.nil?
    nil
  else
    self.id = hash['id'].to_i
    self.name = hash['name']
    self
  end
end
56
75
  end
57
76
  end
data/lib/skydb/client.rb CHANGED
@@ -71,9 +71,14 @@ class SkyDB
71
71
  # Retrieves an individual table from the server, if it exists. Otherwise
72
72
  # returns nil.
73
73
  #
74
- # @param [Fixnum] action_id the identifier of the action to retrieve.
75
- def get_table(action_id, options={})
76
- return send_message(SkyDB::Message::GetTable.new(action_id, options))
74
+ # @param [String] name the table name to retrieve.
75
# Builds a GetTable message for the named table and sends it.
#
# @param name the table name to retrieve.
# @param [Hash] options passed through to the message.
# @return whatever send_message returns for the request.
def get_table(name, options={})
  request = SkyDB::Message::GetTable.new(name, options)
  send_message(request)
end
78
+
79
+ # Retrieves a list of all tables on the server.
80
# Builds a GetTables message and sends it.
#
# @param [Hash] options passed through to the message.
# @return whatever send_message returns for the request.
def get_tables(options={})
  request = SkyDB::Message::GetTables.new(options)
  send_message(request)
end
78
83
 
79
84
 
@@ -190,10 +195,15 @@ class SkyDB
190
195
  ####################################
191
196
 
192
197
  # Starts a query against the database.
198
# Creates a new query object bound to this client.
#
# @return [SkyDB::Query] a query constructed with :client set to self.
def query
  SkyDB::Query.new(:client => self)
end
201
+
202
+ # Starts a query with a single selection against the database.
193
203
  #
194
- # @param [String] selection a list of properties to select from the database.
204
+ # @param [String] fields a list of properties to select from the database.
195
205
  def select(fields)
196
- return SkyDB::Query.new(:client => self).select(fields)
206
+ return query.select(fields)
197
207
  end
198
208
 
199
209
 
data/lib/skydb/event.rb CHANGED
@@ -25,12 +25,8 @@ class SkyDB
25
25
  # Object ID
26
26
  ##################################
27
27
 
28
- # The numeric identifier of the object that the event is attached to.
29
- attr_reader :object_id
30
-
31
- def object_id=(value)
32
- @object_id = value.to_i
33
- end
28
+ # The object identifier.
29
+ attr_accessor :object_id
34
30
 
35
31
  ##################################
36
32
  # Timestamp
@@ -103,7 +99,7 @@ class SkyDB
103
99
  # Encodes the event into MsgPack format.
104
100
  def to_msgpack
105
101
  obj = {
106
- :objectId => object_id,
102
+ :objectId => object_id.to_msgpack,
107
103
  :timestamp => SkyDB::Timestamp.to_timestamp(timestamp)
108
104
  }
109
105
  obj[:action] = action unless action.nil? || action.empty?
@@ -1,6 +1,12 @@
1
1
  require 'yaml'
2
2
  require 'csv'
3
+ require 'yajl'
4
+ require 'zlib'
5
+ require 'bzip2'
6
+ require 'open-uri'
3
7
  require 'ruby-progressbar'
8
+ require 'apachelogregex'
9
+ require 'useragent'
4
10
 
5
11
  class SkyDB
6
12
  class Import
@@ -11,6 +17,7 @@ class SkyDB
11
17
  #
12
18
  ##########################################################################
13
19
 
20
+ class UnsupportedFileType < StandardError; end
14
21
  class TransformNotFound < StandardError; end
15
22
 
16
23
 
@@ -28,6 +35,7 @@ class SkyDB
28
35
  self.table_name = options[:table_name]
29
36
  self.format = options[:format]
30
37
  self.files = options[:files] || []
38
+ self.processes = options[:processes] || 1
31
39
  end
32
40
 
33
41
 
@@ -37,6 +45,9 @@ class SkyDB
37
45
  #
38
46
  ##########################################################################
39
47
 
48
+ # The number of processes to use.
49
+ attr_accessor :processes
50
+
40
51
  # The client to access the Sky server with.
41
52
  attr_accessor :client
42
53
 
@@ -56,6 +67,10 @@ class SkyDB
56
67
  # treat the CSV input as not having a header row.
57
68
  attr_accessor :headers
58
69
 
70
+ # The file type of file being imported can be one of
71
+ # :csv, :tsv, :json, :apache_log
72
+ attr_accessor :file_type
73
+
59
74
 
60
75
  ##########################################################################
61
76
  #
@@ -67,83 +82,244 @@ class SkyDB
67
82
  # Import
68
83
  ##################################
69
84
 
70
- # Imports the rows from a list of files.
85
+ # Imports records from a list of files.
71
86
  #
72
87
  # @param [Array] files a list of files to import.
73
- def import(files)
88
# Imports records from a list of files.
#
# The files are split into groups and each group is imported by a forked
# child process. Every record is translated, checked for an object id and a
# timestamp, and sent to Sky as an event.
#
# @param [Array, String] files a path or glob pattern, or a list of them.
# @param [Hash] options import options. :progress_bar (default true) toggles
#   the progress bar; the options are also handed to each_record.
# @return [nil]
def import(files, options={})
  files = [files] unless files.is_a?(Array)

  # Work on a copy so the caller's hash is never modified.
  options = options.dup
  options[:progress_bar] = true unless options.has_key?(:progress_bar)

  # Set the table to import into.
  SkyDB.table_name = table_name

  files_expanded = expand_import_files(files)

  # Disable the progress bar if any input is compressed, because its line
  # count is not known up front.
  if files_expanded.any? { |path| ['.gz', '.bz2'].include?(File.extname(path).downcase) }
    options[:progress_bar] = false
  end

  # Initialize progress bar. Lines are counted in Ruby rather than through a
  # shell command, so unusual file names cannot break or alter the command.
  progress_bar = nil
  if options[:progress_bar] && processes == 1
    count = files_expanded.inject(0) { |total, path| total + File.foreach(path).count }
    progress_bar = ::ProgressBar.create(:total => count, :format => '|%B| %P%%')
  end

  # Fork one child per group of files. Iterating over the groups (instead of
  # over the process count) avoids children with no group to work on.
  process_ids = partition_import_files(files_expanded, processes).map do |group|
    fork do
      SkyDB.multi(:max_count => 1000) do
        group.each do |file|
          each_record(file, options) do |input|
            # Convert input line to a symbolized hash.
            output = translate(input)
            output._symbolize_keys!

            if output[:object_id].nil?
              progress_bar.clear() unless progress_bar.nil?
              $stderr.puts "[ERROR] Object id required on line #{$.}"
            elsif output[:timestamp].nil?
              progress_bar.clear() unless progress_bar.nil?
              $stderr.puts "[ERROR] Invalid timestamp on line #{$.}"
            else
              # Convert hash to an event and send to Sky.
              SkyDB.add_event(SkyDB::Event.new(output))
            end

            # Update progress bar.
            progress_bar.increment() unless progress_bar.nil?
          end
        end
      end
    end
  end
  process_ids.each { |process_id| Process.waitpid(process_id) }

  # Finish progress bar.
  progress_bar.finish() unless progress_bar.nil? || progress_bar.finished?

  return nil
end

# Expands paths and glob patterns into a flat list of absolute file paths,
# leaving out directories.
def expand_import_files(patterns)
  patterns.inject([]) do |paths, pattern|
    paths.concat(Dir[File.expand_path(pattern)].reject { |path| File.directory?(path) })
  end
end
private :expand_import_files

# Splits paths into at most group_count groups of roughly equal size. With a
# single process everything stays in one group; with several processes and no
# paths there are no groups at all.
def partition_import_files(paths, group_count)
  return [paths] unless group_count.to_i > 1
  return [] if paths.empty?
  paths.each_slice((paths.size / Float(group_count)).ceil).to_a
end
private :partition_import_files
156
+
140
157
 
141
- # Finish progress bar.
142
- progress_bar.finish()
158
+ ##################################
159
+ # File Iteration
160
+ ##################################
161
+
162
# Yields every line of a file, decompressing it on the fly when the file
# extension is .bz2 or .gz (matched case-insensitively).
#
# @param [String] file the path to the file to read.
# @yield [String] each line of the (decompressed) file.
def file_foreach(file, &block)
  case File.extname(file).downcase
  when '.bz2'
    Bzip2::Reader.foreach(file, &block)
  when '.gz'
    Zlib::GzipReader.open(file) do |gz|
      # each_line takes a line separator as its argument, so it must be
      # called without the file path to split on newlines.
      gz.each_line(&block)
    end
  else
    File.foreach(file, &block)
  end
end
180
+
181
+
182
+ ##################################
183
+ # Iteration
184
+ ##################################
185
+
186
+ # Executes a block for each record in a given file. A record is defined
187
+ # by the file's type (:csv, :tsv, :json, :apache_log).
188
+ #
189
+ # @param [String] file the path to the file to iterate over.
190
# Executes a block for each record in a given file, dispatching to the
# reader that matches the importer's file type.
#
# When no file type has been set it is derived from the file extension
# (case-insensitively), stored on the importer, and reported with a warning.
#
# @param [String] file the path to the file to iterate over.
# @param [Hash] options passed through to the type-specific reader.
# @yield each record produced by the type-specific reader.
# @raise [SkyDB::Import::Importer::UnsupportedFileType] when the file type is
#   not one of :csv, :tsv, :json or :apache_log.
# @return [nil]
def each_record(file, options, &block)
  # Determine file type automatically if not passed in.
  if file_type.nil?
    self.file_type =
      case File.extname(file).downcase
      when '.tsv', '.txt' then :tsv
      when '.json' then :json
      when '.csv' then :csv
      when '.log' then :apache_log
      end
    warn("[import] Determining file type: #{file_type || '???'}")
  end

  # Process the record by file type. The block is captured explicitly because
  # a bare Proc.new can no longer pick up the method's block on Ruby 3.
  case file_type
  when :csv then each_text_record(file, ",", options, &block)
  when :tsv then each_text_record(file, "\t", options, &block)
  when :json then each_json_record(file, options, &block)
  when :apache_log then each_apache_log_record(file, options, &block)
  else raise SkyDB::Import::Importer::UnsupportedFileType, "File type not supported by importer: #{file_type || File.extname(file)}"
  end

  return nil
end
215
+
216
+ # Executes a block for each line of a delimited flat file format
217
+ # (CSV, TSV).
218
+ #
219
+ # @param [String] file the path to the file to iterate over.
220
+ # @param [String] col_sep the column separator.
221
# Executes a block for each non-blank row of a delimited flat file
# (CSV, TSV).
#
# Without explicit headers the first row of the file names the columns; with
# explicit headers every row is data and is mapped by position.
#
# @param [String] file the path to the file to iterate over.
# @param [String] col_sep the column separator.
# @param [Hash] options unused by this reader.
# @yield [Hash] a record mapping header names to cell values.
def each_text_record(file, col_sep, options)
  CSV.foreach(file, :headers => headers.nil?, :col_sep => col_sep) do |row|
    record =
      if headers.nil?
        # Column names come from the file itself.
        row.to_hash
      else
        # Pair each configured header with the cell at the same position.
        headers.each_with_index.inject({}) do |mapped, (header, index)|
          mapped[header] = row[index]
          mapped
        end
      end

    # Skip over blank rows.
    next if record.values.all? { |value| value.nil? || value == '' }

    yield(record)
  end
end
246
+
247
+ # Executes a block for each record parsed from a JSON file.
248
+ #
249
+ # @param [String] file the path to the file to iterate over.
250
# Executes a block for each record parsed from a JSON file.
#
# @param [String] file the path to the file to iterate over.
# @param [Hash] options unused by this reader.
# @yield each record handed back by the Yajl parser.
def each_json_record(file, options)
  # The block form closes the stream when parsing ends, including when the
  # parser or the caller's block raises.
  # NOTE(review): Kernel#open treats a leading "|" as a command to run; if
  # file names can come from untrusted input, consider File.open instead.
  open(file) do |io|
    # Process each record of the JSON file.
    Yajl::Parser.parse(io) do |record|
      yield(record)
    end
  end
end
258
+
259
+ # Executes a block for each line of a standard Apache log file.
260
+ #
261
+ # @param [String] file the path to the file to iterate over.
262
# Executes a block for each line of a standard Apache log file.
#
# Each line is parsed with ApacheLogRegex and turned into a record holding
# the client address, timestamp, request method and URL, status code and
# size, plus details derived from the URL, the referrer and the user agent.
# HEAD and OPTIONS requests are skipped. Lines the parser rejects are
# reported on standard error and skipped.
#
# @param [String] file the path to the file to iterate over.
# @param [Hash] options :format overrides the default combined log format.
# @yield [Hash] the record built from one log line.
def each_apache_log_record(file, options)
  format = options[:format] || '%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
  parser = ApacheLogRegex.new(format)

  file_foreach(file) do |line|
    begin
      hash = parser.parse!(line)
      _, http_method, url = *hash['%r'].to_s.match(/^(\w+) ([^ ]+)/)

      # Skip junk log entries before doing any further parsing work.
      next if http_method == "HEAD" || http_method == "OPTIONS"

      record = {
        :ip_address => hash['%h'],
        :timestamp => apache_log_timestamp(hash['%t']),
        :method => http_method,
        :url => url,
        # The default format captures the final status as %>s; %s is kept as
        # a fallback for custom formats that use it.
        :status_code => hash['%>s'] || hash['%s'],
        :size => hash['%b'],
      }
      record[:user_identifier] = hash['%l'] unless hash['%l'] == '-'
      record[:user_id] = hash['%u'] unless hash['%u'] == '-'

      # Extract the parts of the URI. The request line only carries a path,
      # so a placeholder host is added to make it parseable.
      uri = url.nil? ? nil : apache_log_uri("http://localhost#{url}")
      unless uri.nil?
        record[:path] = uri.path
        record[:query_string] = uri.query
        record[:query] = apache_log_query_params(uri.query)
        record[:fragment] = uri.fragment
      end

      # Extract the referrer if there is one.
      referer = hash['%{Referer}i']
      if !referer.nil? && referer != '-'
        record[:referer] = referer
        referer_uri = apache_log_uri(referer)
        unless referer_uri.nil?
          record[:referer_host] = referer_uri.host
          record[:referer_path] = referer_uri.path
          record[:referer_query_string] = referer_uri.query
          record[:referer_query] = apache_log_query_params(referer_uri.query)
        end
      end

      # Extract specific user agent information.
      raw_user_agent = hash['%{User-Agent}i']
      unless raw_user_agent.nil?
        user_agent = UserAgent.parse(raw_user_agent)
        record[:user_agent] = raw_user_agent
        record[:ua_name] = user_agent.browser.to_s unless user_agent.browser.nil?
        record[:ua_version] = user_agent.version.to_s unless user_agent.version.nil?
        record[:ua_platform] = user_agent.platform.to_s unless user_agent.platform.nil?
        record[:ua_os] = user_agent.os.to_s unless user_agent.os.nil?
        record[:ua_mobile] = user_agent.mobile?
      end

      yield(record)

    rescue ApacheLogRegex::ParseError => e
      $stderr.puts "[ERROR] Unable to parse line #{$.} in #{file} (#{e.message})"
    end
  end
end

# Parses an Apache log timestamp such as "[10/Oct/2000:13:55:36 -0700]".
# Returns nil when the value is missing or malformed so that one bad line
# does not abort the whole import.
def apache_log_timestamp(value)
  DateTime.strptime(value.to_s.gsub(/\[|\]/, ''), "%d/%b/%Y:%H:%M:%S %z")
rescue ArgumentError
  nil
end
private :apache_log_timestamp

# Parses text into a URI, returning nil when it is not a valid URI.
def apache_log_uri(text)
  URI.parse(text)
rescue URI::InvalidURIError
  nil
end
private :apache_log_uri

# Decodes a query string into a hash that maps each parameter name to the
# list of its values. Returns an empty hash for a missing or malformed query.
def apache_log_query_params(query_string)
  return {} if query_string.nil?
  URI.decode_www_form(query_string).inject({}) do |params, (key, value)|
    (params[key] ||= []) << value
    params
  end
rescue ArgumentError
  {}
end
private :apache_log_query_params
147
323
 
148
324
 
149
325
  ##################################
@@ -156,12 +332,14 @@ class SkyDB
156
332
  #
157
333
  # @return [Hash] the output hash.
158
334
  def translate(input)
159
- output = {}
335
+ output = {:action => {}, :data => {}}
160
336
 
161
337
  translators.each do |translator|
162
338
  translator.translate(input, output)
163
339
  end
164
340
 
341
+ output.delete(:action) if output[:action].keys.length == 0
342
+ output.delete(:data) if output[:data].keys.length == 0
165
343
  return output
166
344
  end
167
345
 
@@ -202,7 +380,6 @@ class SkyDB
202
380
  # @param [Hash] the hash of transform info.
203
381
  # @param [Array] the path of fields.
204
382
  def load_transform_fields(fields, path=nil)
205
-
206
383
  # Convert each field to a translator.
207
384
  fields.each_pair do |key, value|
208
385
  translator = Translator.new(:output_field => (path.nil? ? key : path.clone.concat([key])))