lwac 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +70 -0
  3. data/README.md +31 -0
  4. data/bin/lwac +132 -0
  5. data/client_config.md +71 -0
  6. data/concepts.md +70 -0
  7. data/config_docs.md +40 -0
  8. data/doc/compile.rb +52 -0
  9. data/doc/template.rhtml +145 -0
  10. data/example_config/client.jv.yml +33 -0
  11. data/example_config/client.yml +34 -0
  12. data/example_config/export.yml +70 -0
  13. data/example_config/import.yml +19 -0
  14. data/example_config/server.yml +97 -0
  15. data/export_config.md +448 -0
  16. data/import_config.md +29 -0
  17. data/index.md +49 -0
  18. data/install.md +29 -0
  19. data/lib/lwac.rb +17 -0
  20. data/lib/lwac/client.rb +354 -0
  21. data/lib/lwac/client/file_cache.rb +160 -0
  22. data/lib/lwac/client/storage.rb +69 -0
  23. data/lib/lwac/export.rb +362 -0
  24. data/lib/lwac/export/format.rb +310 -0
  25. data/lib/lwac/export/key_value_format.rb +132 -0
  26. data/lib/lwac/export/resources.rb +82 -0
  27. data/lib/lwac/import.rb +152 -0
  28. data/lib/lwac/server.rb +294 -0
  29. data/lib/lwac/server/consistency_manager.rb +265 -0
  30. data/lib/lwac/server/db_conn.rb +376 -0
  31. data/lib/lwac/server/storage_manager.rb +290 -0
  32. data/lib/lwac/shared/data_types.rb +283 -0
  33. data/lib/lwac/shared/identity.rb +44 -0
  34. data/lib/lwac/shared/launch_tools.rb +87 -0
  35. data/lib/lwac/shared/multilog.rb +158 -0
  36. data/lib/lwac/shared/serialiser.rb +86 -0
  37. data/limits.md +114 -0
  38. data/log_config.md +30 -0
  39. data/monitoring.md +13 -0
  40. data/resources/schemata/mysql/links.sql +7 -0
  41. data/resources/schemata/sqlite/links.sql +5 -0
  42. data/server_config.md +242 -0
  43. data/tools.md +89 -0
  44. data/workflows.md +39 -0
  45. metadata +140 -0
@@ -0,0 +1,283 @@
1
+ # Sets are used to hold links in an unordered, non-duplicated fashion.
2
+ require 'set'
3
+
4
+
5
+
6
+ module LWAC
7
+
8
# -----------------------------------------------------------------------------
# Holds a datapoint, which is the return value from querying a link.
#
# All fields are exposed through readers only; the object itself is not frozen.
class DataPoint
  attr_reader :link, :headers, :head, :body, :response_properties, :client_id, :error

  # Methods to extract from Curl::Easy
  INTERESTING_CURL_EASY_METHODS = %w{
    body_str
    cacert
    cert
    cert_key
    certtype
    connect_time
    connect_timeout
    content_type
    cookiefile
    cookiejar
    cookies
    dns_cache_timeout
    download_speed
    downloaded_bytes
    downloaded_content_length
    enable_cookies?
    encoding
    fetch_file_time?
    file_time
    follow_location?
    ftp_commands
    ftp_entry_path
    ftp_filemethod
    ftp_response_timeout
    header_in_body?
    header_size
    header_str
    headers
    http_auth_types
    http_connect_code
    ignore_content_length?
    interface
    last_effective_url
    local_port
    local_port_range
    low_speed_limit
    low_speed_time
    max_redirects
    multipart_form_post?
    name_lookup_time
    num_connects
    os_errno
    password
    post_body
    pre_transfer_time
    primary_ip
    proxy_auth_types
    proxy_port
    proxy_tunnel?
    proxy_type
    proxy_url
    proxypwd
    redirect_count
    redirect_time
    redirect_url
    request_size
    resolve_mode
    response_code
    ssl_verify_host
    ssl_verify_peer?
    ssl_version
    start_transfer_time
    status
    timeout
    total_time
    unrestricted_auth?
    upload_speed
    uploaded_bytes
    uploaded_content_length
    url
    use_netrc?
    use_ssl
    useragent
    username
    userpwd
    verbose?
  }

  # link                - the link this datapoint was fetched for
  # headers             - either a Hash of headers, or a raw header string
  #                       (nil is treated as "no headers") which is parsed
  #                       with DataPoint.headers_to_hash
  # head                - the raw header text
  # body                - the response body
  # response_properties - a Hash of extra properties describing the response
  # client_id           - identifier of the client that produced the datapoint
  # error               - optional error recorded for the request
  def initialize(link, headers, head, body, response_properties, client_id, error=nil)
    @link                = link
    @headers             = headers.is_a?(Hash) ? headers : DataPoint.headers_to_hash(headers)
    @head                = head
    @body                = body
    @response_properties = response_properties
    @error               = error
    @client_id           = client_id
  end

  def to_s
    "<DataPoint #{@link.to_s}>"
  end

  # Turns HTTP headers into a ruby hash, by parsing them as a string.
  #
  # Each line is split at its FIRST colon, so values that themselves contain
  # colons (dates, URLs) are kept whole.  Lines without a colon, such as the
  # status line, are skipped.  Names and values are stripped of surrounding
  # whitespace.  When a name is repeated the last occurrence wins.
  #
  # Returns a Hash of String => String.
  def self.headers_to_hash(header_string)
    headers = {}
    header_string.to_s.each_line do |ln|
      # partition never builds a negative range, so a line that begins with
      # ':' yields an empty name instead of swallowing the whole line.
      key, colon, val = ln.partition(':')
      next if colon.empty?
      headers[key.strip] = val.strip
    end
    headers
  end

  # Converts a Curl result and an originating link
  # into a datapoint with a standard character encoding, etc.
  #
  # config    - Hash read here for :fix_encoding, :target_encoding,
  #             :encoding_options and :mimes (:policy, :list, :ignore_case)
  # link      - the originating link (must respond to #id)
  # res       - the response object; every method named in
  #             INTERESTING_CURL_EASY_METHODS is called on it
  # client_id - identifier stored on the datapoint
  # error     - error (or nil) stored on the datapoint
  #
  # Logs progress to the global $log.
  def self.from_request(config, link, res, client_id, error)
    require 'curl' # loaded lazily, only when a response is actually converted

    # Fix encoding of body if required
    $log.debug "Fixing body encoding..."
    body = fix_encoding(res.body_str.to_s, config)

    # Fix encoding of head if required
    $log.debug "Fixing header encoding..."
    head = fix_encoding(res.header_str.to_s, config)

    # Generate a hash of headers
    $log.debug "Parsing headers..."
    header_hash = DataPoint.headers_to_hash(head)

    # Per-regex MIME handling.
    # In :blacklist mode a body is kept unless a regex matches; in :whitelist
    # mode it is kept only if a regex matches.
    $log.debug "Passing MIME filter in #{config[:mimes][:policy]} mode..."
    allow_mime   = (config[:mimes][:policy] == :blacklist)
    content_type = header_hash["Content-Type"].to_s
    config[:mimes][:list].each do |mime_rx|
      if content_type =~ Regexp.new(mime_rx, config[:mimes][:ignore_case])
        allow_mime = (config[:mimes][:policy] == :whitelist)
        $log.debug "Link #{link.id} matched MIME regex #{mime_rx}"
      end
    end
    body = '' unless allow_mime

    # Load stuff out of response object.
    $log.debug "Extracting #{INTERESTING_CURL_EASY_METHODS.length} details from result..."
    response_properties = {}
    INTERESTING_CURL_EASY_METHODS.each do |m|
      response_properties[m.to_sym] = res.send(m.to_sym)
    end
    response_properties[:mime_allowed] = allow_mime

    DataPoint.new(link,
                  header_hash,
                  head,
                  body,
                  response_properties,
                  client_id,
                  error)
  end

  # On user request, set the string encoding to something and provide policy for its fixes.
  #
  # Returns str untouched unless config[:fix_encoding] is set; otherwise
  # transcodes it to config[:target_encoding] using config[:encoding_options]
  # (a Hash of String#encode options, may be nil).
  #
  # NOTE: this sat under a bare `private` originally, which does not apply to
  # `def self.` methods, so it has always been publicly callable.  It is left
  # public here so existing callers keep working.
  def self.fix_encoding(str, config)
    return str if not config[:fix_encoding]

    # The options are splatted as keywords: passed positionally, the hash
    # occupies String#encode's source-encoding slot instead of being read
    # as options.
    str.encode(config[:target_encoding], **(config[:encoding_options] || {}))
  end

end
180
+
181
+
182
+
183
# -----------------------------------------------------------------------------
# A single link: an identifier paired with the URI it refers to.
# Both fields are read-only once the object has been built.
class Link
  attr_reader :id, :uri

  # id  - identifier for the link
  # uri - the address the link points at
  def initialize(id, uri)
    @id, @uri = id, uri
  end

  # Renders the link as "<id|uri>".
  def to_s
    "<" + "#{@id}" + "|" + "#{@uri}" + ">"
  end
end
197
+
198
+
199
+
200
# -----------------------------------------------------------------------------
# Book-keeping for one sample:
#  * how many links it covers and how many have been finished
#  * links that have been read but not yet used
#  * start/end times and an approximate cumulative data size
#
# The open/closed flag is advisory: nothing in this class refuses updates
# while the sample is closed.
class Sample
  attr_reader :id, :sample_start_time, :size, :progress
  attr_accessor :sample_end_time, :last_dp_id, :pending, :approx_filesize

  # id                - identifier of the sample
  # size              - number of datapoints in the sample (coerced with to_i)
  # start_id          - where to start reading the next IDs from
  # pending_links     - links already read but not yet used
  # permit_sampling   - whether the sample starts out open
  # sample_start_time - when the sample started
  def initialize(id, size, start_id=0, pending_links=Set.new, permit_sampling=false, sample_start_time=Time.now)
    @id   = id
    @size = size.to_i

    # Nothing has been completed yet, and no data has been counted.
    @progress        = 0
    @approx_filesize = 0

    # Reading position and the backlog of unused links.
    @last_dp_id = start_id
    @pending    = pending_links

    # Open/closed flag and timing.
    @permit_sampling   = permit_sampling
    @sample_start_time = sample_start_time
    @sample_end_time   = nil
  end

  # Mark the sample as open and restart its clock.
  def open_sample
    @permit_sampling   = true
    @sample_start_time = Time.now
  end

  # Mark the sample as closed and record when that happened.
  def close_sample
    @permit_sampling = false
    @sample_end_time = Time.now
  end

  # True once as many links have been completed as the sample contains.
  def complete?
    @size <= @progress
  end

  # Is the sample currently open?
  def open?
    @permit_sampling
  end

  # Record one finished link, adding its size (if known) to the running total.
  # Returns the new progress count.
  def link_complete(filesize)
    @approx_filesize += filesize if filesize
    @progress += 1
  end

  # How many links are still to be completed.
  def remaining
    @size - @progress
  end

  # Nicer output
  def to_s
    openness     = open? ? "open" : "closed"
    completeness = complete? ? "complete" : "incomplete"
    "<Sample #{@id}, #{@progress}/#{@size} [#{openness}, #{completeness}]>"
  end
end
262
+
263
+
264
+
265
# -----------------------------------------------------------------------------
# Holds time-dependent parameters of the server, meaning:
#  * The last sample that was completed, and its duration
#  * The current sample in progress
#  * The time when the next sample is due
class ServerState
  attr_accessor :last_sample_id, :current_sample, :next_sample_due, :last_sample_duration
  attr_reader :version

  # version              - version string recorded with the state
  # last_sample_id       - id of the last completed sample (-1 by default)
  # current_sample       - the sample in progress; defaults to Sample.new(-1, 0)
  # next_sample_due      - when the next sample is due; defaults to now
  # last_sample_duration - duration of the last sample; defaults to 1
  #
  # The final parameter is optional and new.  Previously the initialiser read
  # its own not-yet-assigned accessor here, so the value was always 1 and
  # could only be changed after construction.
  def initialize(version, last_sample_id = -1, current_sample = nil, next_sample_due = nil, last_sample_duration = nil)
    @version              = version
    @last_sample_id       = last_sample_id
    @current_sample       = current_sample || Sample.new(-1, 0)
    @next_sample_due      = next_sample_due || Time.now
    @last_sample_duration = last_sample_duration || 1
  end
end
282
+
283
+ end
@@ -0,0 +1,44 @@
1
# Contains methods for reporting and testing versioning.
#
# Actual version numbers are held in lib/lwac.rb
module LWAC::Identity

  # Versions that may be loaded by the storage manager
  # If it ain't in this list, it ain't coming off disk into RAM.
  COMPATIBLE_CORPUS_VERSIONS  = [LWAC::VERSION]
  COMPATIBLE_NETWORK_VERSIONS = [LWAC::VERSION]

  # Print the author string?
  POMPOUS_MODE = true

  # How many authors are named in full before the rest are summarised.
  MAX_LISTED_AUTHORS = 4

  # Checks if a given version of a corpus is compatible
  def self.storage_is_compatible?(ver)
    COMPATIBLE_CORPUS_VERSIONS.include?(ver)
  end

  # Checks if a given version of a client is compatible
  def self.network_is_compatible?(ver)
    COMPATIBLE_NETWORK_VERSIONS.include?(ver)
  end

  # Present the version to the log.
  #
  # Writes a version line and, when POMPOUS_MODE is set, an author line to
  # $log at info level; falls back to standard output when $log is not set.
  # Returns the list of messages written.
  def self.announce_version
    msgs = ["LWAC v#{LWAC::VERSION} (#{LWAC::DATE})"]

    if POMPOUS_MODE
      named  = LWAC::AUTHORS.first(MAX_LISTED_AUTHORS)
      # Everyone who was not named is counted.  The count and the threshold
      # were previously based on 5 while only 4 names were printed, so a
      # fifth author vanished and the "more" figure was one short.
      hidden = LWAC::AUTHORS.length - named.length

      auth_string = "by #{named.map{|a| "#{a[:name]} <#{a[:contact]}>"}.join(", ")}"
      auth_string += " and #{hidden} more." if hidden > 0
      msgs << auth_string
    end

    if $log
      msgs.each{|m| $log.info(m) }
    else
      msgs.each{|m| puts m }
    end
  end
end
42
+
43
+
44
+
@@ -0,0 +1,87 @@
1
+ # -----------------------------------------------------------------------------
2
+ # These procedures are designed as helpers for launching the various utilities
3
+ # in LWAC. They cover:
4
+ # * Loading configs
5
+ # * Checking dependencies at runtime for helpful error output
6
+ # * Instantiating global log objects
7
+
8
+ require 'lwac/shared/multilog'
9
+
10
+ module LWAC
11
+
12
# Writes the command-line usage summary to standard error.
def self.print_usage
  usage = [
    "USAGE: #{$PROGRAM_NAME} TOOL CONFIG [IMPORT_FILE]",
    "",
    " TOOL : one of 'server', 'client', 'import' or 'export'",
    " CONFIG : A path to the config file for the tool",
    " IMPORT_FILE : A URL list to import",
    ""
  ]

  # puts prints each element of an array on its own line.
  $stderr.puts(usage)
end
20
+
21
+
22
# -----------------------------------------------------------------------------
# Validate the command line, load the YAML config named by ARGV[1], and set up
# the global logger and signal handlers.
#
# ARGV[0] names the tool (server, client, import or export) and ARGV[1] is
# the path of its config file.  On bad arguments the usage text is printed and
# the process exits with status 1.
#
# Side effects: assigns the global $log and installs handlers for INT, HUP
# and ABRT that raise a SignalException.
#
# Returns [tool, config] where tool is ARGV[0] as a Symbol and config is
# whatever the YAML file contained.
def self.load_config

  # First, check arguments are fine.
  if ARGV.length < 2 || !File.readable?(ARGV[1])
    print_usage()
    exit(1)
  end

  # Check the tool is a valid one
  unless %w{server client import export}.include?(ARGV[0])
    $stderr.puts "Not a valid command: #{ARGV[0]}"
    print_usage()
    exit(1)
  end

  # Require things we need for the below
  require 'yaml'
  require 'logger'

  # Then load the config
  tool   = ARGV[0].to_sym
  config = YAML.load_file(ARGV[1])

  # The logging section is optional: a missing or malformed one means
  # "no configured logs and an empty program name" rather than a crash.
  logging = config.is_a?(Hash) ? config[:logging] : nil
  logging = {} unless logging.is_a?(Hash)

  # Then, create global log.
  # The two standard streams may be named in the config; they are looked up
  # in a fixed table rather than evaluated as code.
  std_streams = { "STDOUT" => STDOUT, "STDERR" => STDERR }
  logdevs     = []
  if logging[:logs].is_a?(Hash)
    logging[:logs].each do |name, ldopts|
      # Anything other than a named standard stream is passed through as
      # given, with standard output as the fallback.
      dev = std_streams.fetch(ldopts[:dev]) { ldopts[:dev] || STDOUT }

      # Add to the list of logs
      logdevs << { :name => name, :dev => dev, :level => ldopts[:level] }
    end
  end
  $log = MultiOutputLogger.new(logdevs, logging[:progname].to_s)

  # Apply nicer log output format
  $log.formatter = proc do |severity, datetime, progname, msg|
    "#{severity.to_s[0]} #{progname} [#{datetime.strftime('%y-%m-%d %H:%M:%S')}] #{msg}\n"
  end

  # Handle signals nicely.
  # KILL is deliberately absent: it cannot be caught, and asking to trap it
  # raises.  A signal the platform does not support is skipped, not fatal.
  $log.debug "Installing signal handlers..."
  %w{INT HUP ABRT}.each do |s|
    begin
      trap(s) { raise SignalException.new(s) }
    rescue ArgumentError, Errno::EINVAL
      $log.debug "Cannot install a handler for SIG#{s} on this platform."
    end
  end

  # Return the config we've loaded.
  return tool, config
end
86
+
87
+ end
@@ -0,0 +1,158 @@
1
+ require 'logger'
2
+
3
+ module LWAC
4
# Add the ability to log to many devices, one for posterity and one for cron.
#
# Each named log has its own device and its own level.  A message is written
# to every device whose level is at or below the message's severity.
class MultiOutputLogger < Logger

  # Default log level
  DEFAULT_LEVEL = Logger::UNKNOWN

  # Level names, indexed by their Logger severity number.
  LEVEL_LABELS = %w(DEBUG INFO WARN ERROR FATAL).freeze

  # logdevs    - what to log to; see configure_logs for the accepted forms
  # progname   - program name placed in each message
  # shift_age  - default rotation age for devices that do not give their own
  # shift_size - default rotation size for devices that do not give their own
  def initialize(logdevs = {}, progname=nil, shift_age = 0, shift_size = 1048576)
    super(nil, shift_age, shift_size)
    @progname     = progname
    @shift_age    = shift_age
    @shift_size   = shift_size
    @lowest_level = DEFAULT_LEVEL
    configure_logs(logdevs)
  end

  # Replace every existing log with the ones described by logdevs, which is:
  #  * an Array of Hashes with optional keys :name, :dev, :level, :shift_age
  #    and :shift_size (missing keys fall back to :default, $stdout,
  #    DEFAULT_LEVEL and the logger-wide rotation settings),
  #  * a single such Hash, or
  #  * a bare device (an IO-like object or a filename), which becomes the
  #    one log named :default; nil gives a log that writes nowhere.
  #
  # The hashes passed in are only read, never modified.
  def configure_logs(logdevs = {})
    # Remove all existing logs
    @logdevs.keys.each{|name| remove_log(name)} if @logdevs

    @logdevs      = {}
    @lowest_level = DEFAULT_LEVEL
    logdevs       = [logdevs] if logdevs.is_a?(Hash)

    # If the user provides a device then set up a single log as :default.
    # It is wrapped like every other device so that filenames work and the
    # summary can describe it.
    unless logdevs.is_a?(Array)
      if logdevs.nil?
        @logdevs[:default] = {:dev => nil, :level => DEFAULT_LEVEL}
      else
        add_log(:default, logdevs, DEFAULT_LEVEL, @shift_age, @shift_size)
      end
      return
    end

    # Otherwise set up one log per description
    logdevs.each{|ld|
      add_log(ld[:name]       || :default,
              ld[:dev]        || $stdout,
              ld[:level]      || DEFAULT_LEVEL,
              ld[:shift_age]  || @shift_age,
              ld[:shift_size] || @shift_size)
    }
  end

  # Add a log.  level may be a severity number or a name such as :info.
  def add_log(name, destination, level, shift_age = 0, shift_size = 1048576)
    level = MultiOutputLogger.string_to_level(level) unless level.is_a?(Integer)
    dev   = LogDevice.new(destination, :shift_age => shift_age, :shift_size => shift_size)

    @logdevs[name] = {:dev => dev, :level => level}
    refresh_lowest_level
  end

  # Stop logging to one of the logs.  The device itself is not closed.
  def remove_log(name)
    return unless @logdevs[name]

    @logdevs.delete(name)
    refresh_lowest_level
  end

  # Print a summary of log output devices
  def summarise_logging
    add(@lowest_level, "Summary of logs:")
    if(@logdevs.length > 0)
      c = 0
      @logdevs.each{|name, ld|
        msg = " (#{c+=1}/#{@logdevs.length}) #{name} (level: #{MultiOutputLogger.level_to_string(ld[:level])}, device: #{describe_device(ld[:dev])})"
        add(@lowest_level, msg)
      }
    else
      add(@lowest_level, " *** No logs!") # Amusingly, this can never output
    end
  end

  # Set the log level of one of the logs.
  #
  # Called with two arguments, sets the level of the named log.  Called with
  # one argument, that argument is the level and it is applied to every log.
  # Raises if the named log does not exist.
  def set_level(name, level=nil)
    # Default
    if not level then
      level = name
      name  = nil
    end

    # Look up the level if the user provided a :symbol or "string"
    level = MultiOutputLogger.string_to_level(level.to_s) unless level.is_a?(Integer)

    if name
      # Set a specific one
      raise "No log by the name '#{name}'" if not @logdevs[name]
      @logdevs[name][:level] = level
    else
      # Set them all by default
      @logdevs.each_value{|logdev| logdev[:level] = level }
    end

    # The cheap early-exit in add depends on this being current; without it
    # lowering a level had no visible effect.
    refresh_lowest_level
  end

  # Returns the log level of a log
  def get_level(name = nil)
    name = :default if not name
    return nil if not @logdevs[name]
    return @logdevs[name][:level]
  end

  # Overrides the basic internal add in Logger
  def add(severity, message = nil, progname = nil, &block)
    severity ||= UNKNOWN

    # give up if too low a severity or if there is nowhere to write
    return true if severity < (@lowest_level || DEFAULT_LEVEL)
    return true if @logdevs.values.all?{|ld| ld[:dev].nil? }

    # Set progname to nil unless it is explicitly specified
    progname ||= @progname
    if message.nil?
      if block_given?
        message = yield
      else
        message  = progname
        progname = @progname
      end
    end

    # Sync time across the logs and output only if above the log level for that device
    msg = format_message(format_severity(severity), Time.now, progname, message)
    @logdevs.each_value{|ld|
      ld[:dev].write(msg) if ld[:dev] and ld[:level] <= severity
    }
    return true
  end

  # convert a level to a string
  def self.level_to_string(lvl)
    return LEVEL_LABELS[lvl] || "UNKNOWN"
  end

  # Convert a string to a logger level number
  def self.string_to_level(str)
    return LEVEL_LABELS.index(str.to_s.upcase) || Logger::UNKNOWN
  end

  # Close every device that has one.
  def close
    @logdevs.each_value{|ld|
      ld[:dev].close if ld[:dev]
    }
  end

  private

  # Recompute the lowest level across all logs; DEFAULT_LEVEL when none remain.
  def refresh_lowest_level
    @lowest_level = @logdevs.values.map{|ld| ld[:level] }.min || DEFAULT_LEVEL
  end

  # One-line description of a device for summarise_logging.
  def describe_device(logdev)
    return "none" if logdev.nil?

    "fd=#{logdev.dev.fileno}#{logdev.dev.tty? ? " TTY" : ""}#{logdev.filename ? " filename=#{logdev.filename}" : ""}"
  end
end
157
+
158
+ end