lwac 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +7 -0
  2. data/LICENSE +70 -0
  3. data/README.md +31 -0
  4. data/bin/lwac +132 -0
  5. data/client_config.md +71 -0
  6. data/concepts.md +70 -0
  7. data/config_docs.md +40 -0
  8. data/doc/compile.rb +52 -0
  9. data/doc/template.rhtml +145 -0
  10. data/example_config/client.jv.yml +33 -0
  11. data/example_config/client.yml +34 -0
  12. data/example_config/export.yml +70 -0
  13. data/example_config/import.yml +19 -0
  14. data/example_config/server.yml +97 -0
  15. data/export_config.md +448 -0
  16. data/import_config.md +29 -0
  17. data/index.md +49 -0
  18. data/install.md +29 -0
  19. data/lib/lwac.rb +17 -0
  20. data/lib/lwac/client.rb +354 -0
  21. data/lib/lwac/client/file_cache.rb +160 -0
  22. data/lib/lwac/client/storage.rb +69 -0
  23. data/lib/lwac/export.rb +362 -0
  24. data/lib/lwac/export/format.rb +310 -0
  25. data/lib/lwac/export/key_value_format.rb +132 -0
  26. data/lib/lwac/export/resources.rb +82 -0
  27. data/lib/lwac/import.rb +152 -0
  28. data/lib/lwac/server.rb +294 -0
  29. data/lib/lwac/server/consistency_manager.rb +265 -0
  30. data/lib/lwac/server/db_conn.rb +376 -0
  31. data/lib/lwac/server/storage_manager.rb +290 -0
  32. data/lib/lwac/shared/data_types.rb +283 -0
  33. data/lib/lwac/shared/identity.rb +44 -0
  34. data/lib/lwac/shared/launch_tools.rb +87 -0
  35. data/lib/lwac/shared/multilog.rb +158 -0
  36. data/lib/lwac/shared/serialiser.rb +86 -0
  37. data/limits.md +114 -0
  38. data/log_config.md +30 -0
  39. data/monitoring.md +13 -0
  40. data/resources/schemata/mysql/links.sql +7 -0
  41. data/resources/schemata/sqlite/links.sql +5 -0
  42. data/server_config.md +242 -0
  43. data/tools.md +89 -0
  44. data/workflows.md +39 -0
  45. metadata +140 -0

data/lib/lwac/client/file_cache.rb
@@ -0,0 +1,160 @@
+
+require 'fileutils'
+require 'thread'
+
+module LWAC
+  class FileCache
+
+    def initialize(filename, max_size = nil)
+      # thread safety
+      @mutex = Mutex.new
+
+      raise "No filename given" if filename == nil
+      @filename = filename
+      reset # pullup: opens a fresh backing file and zeroes the write offset
+
+      # index system for lookup
+      @index = {}
+      @orphan_keys = []
+
+      # TODO: Max size in bytes
+      # @max_filesize = max_size
+    end
+
+    # Read a value
+    def [](key)
+      @mutex.synchronize{
+        return if not @index.include?(key)
+
+        @file.seek( @index[key][:start] )
+        return Marshal.load( @file.read( @index[key][:len] ) )
+      }
+    end
+
+    # Write a value
+    def []=(key, value)
+      @mutex.synchronize{
+        # keep record of the old version if there is already a value
+        @orphan_keys << {:key => key, :value => @index.delete(key)} if @index[key]
+
+        # Keep a note of where we're writing
+        @index[key] = {:start => @end_of_file}
+
+        # Write
+        @file.seek(@end_of_file)
+        @file.write( Marshal.dump(value) )
+        @file.flush
+        @end_of_file = @file.pos
+
+        # then read off position as a length
+        @index[key][:len] = @end_of_file - @index[key][:start]
+      }
+    end
+
+    # Wipe the store entirely
+    def wipe
+      @mutex.synchronize{
+        @file.close if @file and not @file.closed?
+        FileUtils.rm(@filename) if File.exist?(@filename)
+        @file = File.open(@filename, 'wb+')
+        @end_of_file = 0
+      }
+    end
+    alias :delete_all :wipe
+    alias :reset :wipe
+
+    # Remove something from the index
+    def delete_from_index(key)
+      @mutex.synchronize{
+        @orphan_keys << {:key => key, :value => @index.delete(key)} if @index.include?(key)
+      }
+    end
+
+    def keys
+      @mutex.synchronize{
+        @index.keys
+      }
+    end
+
+    # Read orphan keys
+    # notably non-unique.
+    def orphan_keys
+      @mutex.synchronize{
+        @orphan_keys.map{|o| o[:key] }
+      }
+    end
+
+    # Enable sync mode
+    def sync=(s)
+      @mutex.synchronize{
+        @file.sync = s
+      }
+    end
+
+    # Status of sync mode
+    def sync
+      @mutex.synchronize{
+        @file.sync
+      }
+    end
+
+    # Flush to disk
+    def flush
+      @mutex.synchronize{
+        @file.flush
+      }
+    end
+
+    # Loop over each key
+    def each_key(&block)
+      @mutex.synchronize{
+        @index.each_key{|k| yield(k) }
+      }
+    end
+
+    # How many items
+    def length
+      @mutex.synchronize{
+        @index.length
+      }
+    end
+
+    def empty?
+      length == 0
+    end
+
+    # filesize in bytes
+    def filesize
+      @end_of_file
+    end
+
+    # Close and remove file
+    def close
+      @mutex.synchronize{
+        @file.close
+        FileUtils.rm(@filename)
+      }
+    end
+
+    # Currently closed?
+    def closed?
+      @mutex.synchronize{
+        @file.closed?
+      }
+    end
+  end
+  #
+  # if __FILE__ == $0 then
+  #   # create new store
+  #   x = FileCache.new("test")
+  #
+  #   100000.times{|i|
+  #     x[i] = i
+  #   }
+  #
+  #   puts "x[20] = #{x[20]}"
+  #
+  #   x.close
+  # end
+  #
+end
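
The commented-out self-test above hints at how the cache is meant to be used. The sketch below is illustrative only: it assumes lwac's lib/ directory is on the load path, and the scratch-file path and keys are invented for the example.

    require 'lwac/client/file_cache'

    # Open (or create) an on-disk cache backed by a scratch file.
    cache = LWAC::FileCache.new('/tmp/lwac_cache.bin')

    # Values are Marshal-dumped to the file; keys live in an in-memory index.
    cache[:dp_1] = { :uri => 'http://example.com', :body => '...' }

    puts cache[:dp_1][:uri]         # => "http://example.com"
    puts cache.length               # => 1
    puts cache.filesize             # bytes written to the backing file so far

    # Overwriting a key orphans the old on-disk record rather than reclaiming it.
    cache[:dp_1] = { :uri => 'http://example.com/other' }
    puts cache.orphan_keys.inspect  # => [:dp_1]

    cache.close                     # closes and deletes the backing file

Note the design trade-off: wipe/delete_all truncates the backing file by recreating it, while overwrites only append, so the file grows until the cache is wiped or closed.
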
data/lib/lwac/client/storage.rb
@@ -0,0 +1,69 @@
+# Storage/cache library for clients
+# This is a simple key-value store, either on disk or in memory, designed for storing datapoints before they're shipped off to the server.
+
+require 'fileutils'
+
+module LWAC
+
+  class Store
+    # Create a new store with a given file.
+    #
+    # If a filepath is given, a FileCache is used for on-disk, persistent storage.
+    # If no filepath is given, data are held in an in-memory Hash,
+    # with a mutex guarding deletions from the Hash to keep them
+    # thread-safe.
+    def initialize(filepath=nil)
+      # Create a mutex if using a hash
+      @mutex = Mutex.new
+
+      if filepath == nil or filepath.to_s == ""
+        @store = Hash.new
+        @type = :hash
+      else
+        @store = FileCache.new(filepath)
+        @type = :file
+      end
+    end
+
+    # ---------------------------------------------------------------------------
+    # Method_missing handles most things...
+
+    def method_missing(m, *args, &block)
+      @store.send(m, *args, &block)
+    rescue NoMethodError => e
+      super
+    end
+
+    # Handle disparity between APIs
+    # ---------------------------------------------------------------------------
+
+    # Closes the file system, missing from Hash
+    def close
+      return if @type == :hash
+      @store.close
+    end
+
+    def delete_from_index(key)
+      if @type == :hash
+        @mutex.synchronize{
+          return @store.delete(key)
+        }
+      end
+      @store.delete_from_index(key)
+    end
+
+    # Removes all items
+    def delete_all
+      # GC's probably quicker than looping and removing stuff
+      if @type == :hash
+        @mutex.synchronize{
+          @store = Hash.new
+        }
+      else
+        @store.delete_all
+      end
+    end
+  end
+
+
+end
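
A sketch of the two Store modes follows. It is illustrative only: the require paths and scratch-file path are assumptions, and the real wiring lives in data/lib/lwac/client.rb, which is not shown in this diff. FileCache is required explicitly because storage.rb itself only requires fileutils.

    require 'lwac/client/file_cache'
    require 'lwac/client/storage'

    # In-memory mode: no filepath, so the backend is a plain Hash.
    mem = LWAC::Store.new
    mem[:link_1] = 'pending'         # delegated to Hash via method_missing
    puts mem.keys.inspect            # => [:link_1]
    mem.delete_all

    # On-disk mode: a filepath switches the backend to FileCache.
    disk = LWAC::Store.new('/tmp/lwac_client_store.bin')
    disk[:link_1] = { :body => '...' }
    disk.delete_from_index(:link_1)  # orphans the record in the FileCache index
    disk.close                       # closes and removes the backing file

Because unknown methods fall straight through to the backend, Store exposes whichever of Hash's or FileCache's APIs the caller uses; only close, delete_from_index and delete_all paper over the differences between the two.
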
data/lib/lwac/export.rb
@@ -0,0 +1,362 @@
+
+require 'lwac/server/storage_manager'
+require 'lwac/export/resources'
+require 'lwac/export/format'
+
+module LWAC
+
+  module OutputFilter
+
+    # -----------------------------------------------------------------------------
+    # Loads filters from the config file, in the following format:
+    #  {:level => {:filter_name => "expression", :name => "expr", :name => "expr"},
+    #   :level => {...}
+    #  }
+    #
+    # Where :level describes one of the filtering levels supported by the export
+    # script:
+    #  :server    --- All data from a server's download process (mainly summary stats)
+    #  :sample    --- Data for a given sample (cross-section)
+    #  :datapoint --- Data for a given link
+    #
+    # Filter names are arbitrary identifiers for your reference.
+    #
+    # Expressions can refer to any properties of the resource they use, or any
+    # resources from higher levels; for example, sample-level filters can refer to
+    # sample.id, but not datapoint.id.
+    #
+    def self.compile_filters( filters )
+      filters.each{|level, fs|
+        $log.info "Compiling #{level}-level filters..."
+
+        if(fs) then
+          fs.each{|f, v|
+            $log.info " Preparing filter #{f}..."
+            v = {:expr => v, :lambda => nil}
+
+            $log.debug "Building expression for filter (#{f})..."
+            begin
+              v[:lambda] = eval("lambda{|data|" + v[:expr] + "}")
+            rescue StandardError => e
+              $log.fatal "Error building expression for field: #{f}."
+              $log.fatal "Please review your configuration."
+              $log.fatal "The exact error was: \n#{e}"
+              $log.fatal "Backtrace: \n#{e.backtrace.join("\n")}"
+              exit(1)
+            end
+            $log.debug "Success so far..."
+
+            # pop back into original list
+            fs[f] = v
+          }
+        end
+
+        filters[level] = fs
+        $log.info "Done."
+      }
+    end
+
+
+
+
+
+    # -----------------------------------------------------------------------------
+    # Runs filters for a given level
+    def self.filter( data, filters )
+      return true if not filters # Accept if no constraints given
+
+      $log.debug "Filtering line..."
+      # Run all constraints, fail fast
+      filters.each{|f, v|
+        if not v[:lambda].call(data)
+          $log.debug "Rejecting due to filter: #{f}"
+          return false
+        end
+      }
+
+      # We got this far, accept!
+      $log.debug "Accepting."
+      return true
+
+    rescue StandardError => e
+      $log.fatal "Error filtering data: #{e}"
+      $log.fatal "This is probably a bug in your filtering expressions."
+      $log.fatal "Current state: filtering #{f}." if defined? f
+      $log.fatal "Backtrace: \n#{e.backtrace.join("\n")}"
+      exit(1)
+    end
+
+  end
+
+
+
+
+
+
+
+  class Exporter
+
+    # Points at the various formatter objects available
+    AVAILABLE_FORMATTERS = {
+      :csv           => CSVFormatter,
+      :multicsv      => MultiCSVFormatter,
+      :json          => JSONFormatter,
+      :multitemplate => MultiTemplateFormatter,
+      :multixml      => MultiXMLFormatter
+    }
+
+    def initialize(config)
+      @config = config
+
+      # Create a new formatter
+      @formatter = AVAILABLE_FORMATTERS[@config[:output][:formatter]].new( @config[:output][:formatter_opts] )
+
+      prepare_filters
+
+      load_server_config
+
+      load_storage_resources
+
+      validate_samples
+
+
+      summarise
+    end
+
+    # Export according to config
+    def export
+      # -----------------------------------------------------------------------------
+      # At this point we have a list of samples that are valid.
+      # We should now probably do something with them :-)
+      # They all go in the structure below
+      data = Resource.new(Data, {:server => nil, :sample => nil, :datapoint => nil})
+
+      # Fire up some accounting variables
+      count = 0
+      progress = [count, Time.now]
+
+
+      # Open the output system
+      $log.debug "Opening formatter for writing..."
+      @formatter.open_output
+
+      # Write headers
+      if @config[:output][:headers]
+        $log.debug "Writing headers (line #{count+=1}/#{@estimated_lines})."
+        @formatter.write_header
+        progress = announce(count, progress, @estimated_lines, @config[:output][:announce])
+      end
+
+
+      # -----------------------------------------------------------------------------
+      # Construct the server (static) resource
+      $log.debug "Constructing server resource..."
+      server = {:links => @storage.read_link_ids.to_a,
+                :complete_sample_count => @available_samples.length,
+                :complete_samples => @available_samples.map{|as| as.id},
+                :next_sample_date => @storage.state.next_sample_due,
+                :current_sample_id => @storage.state.current_sample.id,
+                :config => @server_config,
+                :version => @storage.state.version
+               }
+      data.server = Resource.new("server", server)
+      #puts server.describe
+
+
+
+      # If we wish to output at the server level, do so.
+      if(@config[:output][:level] == :server) then
+        # output at server level
+        $log.debug "Writing output at server level (line #{count+=1}/#{@estimated_lines})."
+        @formatter << data
+        progress = announce(count, progress, @estimated_lines, @config[:output][:announce])
+        #.values
+      else
+        # ...continue to sample at a lower level
+        # -----------------------------------------------------------------------------
+        # One level deep, loop through samples and construct their resource
+        $log.debug "Constructing sample resources..."
+        @available_samples.each{|as|
+          sample = {:id => as.id,
+                    :start_time => as.sample_start_time,
+                    :end_time => as.sample_end_time,
+                    :complete => as.complete?,
+                    :open => as.open?,
+                    :size => as.size,
+                    :duration => (as.sample_end_time && as.sample_start_time) ? as.sample_end_time - as.sample_start_time : 0,
+                    :start_time_s => as.sample_start_time.to_i,
+                    :end_time_s => as.sample_end_time.to_i,
+                    # :num_pending_links => as.pending.length,
+                    # Either form takes way too long to compute on large servers
+                    # :pending_links => data.server.links - (data.server.links.clone.delete_if{|x| x > as.last_dp_id} - as.pending.to_a),
+                    # :pending_links => data.server.links.clone.to_a.delete_if{|id| (not as.pending.to_a.include?(id)) or (id > as.last_dp_id) },
+                    :size_on_disk => as.approx_filesize,
+                    :last_contiguous_id => as.last_dp_id,
+                    :dir => @storage.get_sample_filepath(as.id),
+                    :path => File.join(@storage.get_sample_filepath(as.id), @server_config[:storage][:sample_filename])
+                   }
+          data.sample = Resource.new("sample", sample)
+          # puts data.describe
+
+
+
+
+          # If this sample is filtered out, ignore it regardless of sampling level
+          if(OutputFilter::filter(data, @config[:output][:filters][:sample])) then
+            # If we wish to sample at the sample level, do so
+            if(@config[:output][:level] == :sample) then
+              # output at sample level
+              $log.debug "Writing output at sample level (line #{count+=1}/#{@estimated_lines})."
+              @formatter << data
+            else
+              # ...continue and build more info
+              # -----------------------------------------------------------------------------
+              # Two levels deep, loop through datapoints and construct their resources.
+              $log.debug "Constructing datapoint resources..."
+              data.server.links.each{|link_id|
+                # Load from disk
+                dp = @storage.read_datapoint( link_id, as )
+
+                datapoint = {:id => dp.link.id || "",
+                             :uri => dp.link.uri || "",
+                             :dir => File.dirname(@storage.get_dp_filepath(link_id, data.sample.id)),
+                             :path => @storage.get_dp_filepath(link_id, data.sample.id),
+                             :client_id => dp.client_id || "",
+                             :error => dp.error || "",
+                             :headers => dp.headers || {},
+                             :head => dp.head || "",
+                             :body => dp.body || "",
+                             :response => dp.response_properties || {}
+                            }
+
+                data.datapoint = Resource.new("datapoint", datapoint)
+                # puts data.describe
+
+
+                # Filter out individual datapoints if necessary
+                if(OutputFilter::filter(data, @config[:output][:filters][:datapoint])) then
+                  # At this point we are at the finest-grained output possible, so
+                  # just output!
+                  $log.debug "Writing output at datapoint level (line #{count+=1}/#{@estimated_lines})."
+                  @formatter << data
+                  progress = announce(count, progress, @estimated_lines, @config[:output][:announce] )
+                else
+                  @estimated_lines -= 1
+                  $log.debug "Discarded datapoint #{data.datapoint.id} due to filter (revised estimate: #{@estimated_lines} lines)."
+                end
+              } # end per-datapoint loop
+            end # end sample if
+
+
+          else # else filter out this sample
+            @estimated_lines -= data.sample.size
+            $log.debug "Discarded sample #{data.sample.id} due to filter (revised estimate: #{@estimated_lines} lines)."
+          end # end filter IF
+
+
+        } # end per-sample loop
+      end # end server if
+
+      @formatter.close_output
+      $log.info "Done."
+    end
+
+    private
+
+
+    # -----------------------------------------------------------------------------
+    # Describe progress through the sample
+    def announce(count, progress, estimated_lines, period)
+      return progress if(count % period) != 0
+
+      # Extract stuff from the progress info
+      last_count, time = progress
+
+      # Compute estimated links remaining
+      links_remaining = estimated_lines - count
+      # Compute time per link since last time
+      time_per_link = (Time.now - time).to_f/(count - last_count).to_f
+      # Compute percentage
+      percentage = ((count.to_f / estimated_lines) * 100).round(2)
+
+      $log.info "#{count}/#{estimated_lines} (#{percentage}%) complete at #{(1.0/time_per_link).round(2)}/s ETA: #{Time.now + (time_per_link * links_remaining)}"
+
+      # Return a new progress list
+      return [count, Time.now]
+    end
+
+
+    # Load server configuration file into RAM
+    def load_server_config
+      # Attempt to load server config
+      if not File.exist?(@config[:server_config]) then
+        $log.fatal "Server config file does not exist at #{@config[:server_config]}"
+        exit(1)
+      end
+      @server_config = YAML.load_file( File.open(@config[:server_config]) )
+    end
+
+    # Start up the two storage managers to inform us of the progress made
+    def load_storage_resources
+      @storage = StorageManager.new(@server_config[:storage])
+      @state = @storage.state
+
+      # -----------------------------------------------------------------------------
+      # Print handy messages to people
+      $log.warn "No samples have completed yet, this is a new deployment." if(@state.last_sample_id == -1)
+      $log.info "Current sample: #{@state.current_sample}."
+
+      cs = @state.current_sample
+      $log.info "The latest sample we can export in full is #{(cs.open? or not cs.complete?) ? @state.last_sample_id : @state.current_sample.id}"
+    end
+
+    # Attempt to account for samples
+    def validate_samples
+      @available_samples = []
+      available_sample_ids = (0..(@state.current_sample.id)).to_a
+      available_sample_ids.each{|sample_id|
+        begin
+          # Ensure the sample has all its files
+          @storage.validate_sample(sample_id)
+
+          # Load the sample metadata
+          sample = @storage.read_sample(sample_id)
+
+          # Check it's closed and complete
+          raise "sample is open" if sample.open?
+          raise "sample is incomplete" if not sample.complete?
+
+          # Pop in the "valid" list.
+          @available_samples << sample
+        rescue StandardError => e
+          $log.warn "Problem reading sample #{sample_id}: #{e.to_s}"
+          $log.debug e.backtrace.join("\n")
+        end
+      }
+      $log.info "Opened #{@available_samples.length} samples successfully."
+      $log.debug "Samples: #{@available_samples.join(", ")}"

+    end
+
+    # Check and compile filters
+    def prepare_filters
+      @config[:output][:filters] = {} if not @config[:output][:filters]
+      OutputFilter::compile_filters( @config[:output][:filters] )
+    end
+
+    # Estimate the time this is going to take and print to screen
+    def summarise
+
+      $log.info "Sampling at the #{@config[:output][:level].to_s} level."
+      @estimated_lines = 0
+      @estimated_lines = @available_samples.length if(@config[:output][:level] == :sample)
+      @estimated_lines = @available_samples.length * @storage.read_link_ids.length if(@config[:output][:level] == :datapoint)
+      $log.info "Estimated output actions: #{@estimated_lines}"
+
+
+    end
+
+
+  end
+
+end
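
For reference, the filter structure that compile_filters expects (documented in the comment block above) looks roughly like the sketch below. The filter names and expression strings are invented for illustration; the exact accessor syntax inside an expression depends on the Resource class from lwac/export/resources.rb, which is not part of this diff, and a global $log logger is assumed to have been set up, as elsewhere in lwac.

    # Hypothetical filter set, keyed by export level.
    filters = {
      :sample    => { :complete_only => 'data.sample.complete' },
      :datapoint => { :no_errors     => 'data.datapoint.error.to_s.empty?' }
    }

    # Each expression string is eval'd once, up front, into lambda{|data| <expr> }.
    LWAC::OutputFilter.compile_filters(filters)

    # During export the Exporter then calls, for each candidate row:
    #   LWAC::OutputFilter.filter(data, filters[:datapoint])
    # and skips the row unless every compiled lambda accepts the `data` resource tree.

As the export loop above shows, rejected samples and datapoints are not written out; they simply reduce the estimated line count used for progress reporting.
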