lwac 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +70 -0
  3. data/README.md +31 -0
  4. data/bin/lwac +132 -0
  5. data/client_config.md +71 -0
  6. data/concepts.md +70 -0
  7. data/config_docs.md +40 -0
  8. data/doc/compile.rb +52 -0
  9. data/doc/template.rhtml +145 -0
  10. data/example_config/client.jv.yml +33 -0
  11. data/example_config/client.yml +34 -0
  12. data/example_config/export.yml +70 -0
  13. data/example_config/import.yml +19 -0
  14. data/example_config/server.yml +97 -0
  15. data/export_config.md +448 -0
  16. data/import_config.md +29 -0
  17. data/index.md +49 -0
  18. data/install.md +29 -0
  19. data/lib/lwac.rb +17 -0
  20. data/lib/lwac/client.rb +354 -0
  21. data/lib/lwac/client/file_cache.rb +160 -0
  22. data/lib/lwac/client/storage.rb +69 -0
  23. data/lib/lwac/export.rb +362 -0
  24. data/lib/lwac/export/format.rb +310 -0
  25. data/lib/lwac/export/key_value_format.rb +132 -0
  26. data/lib/lwac/export/resources.rb +82 -0
  27. data/lib/lwac/import.rb +152 -0
  28. data/lib/lwac/server.rb +294 -0
  29. data/lib/lwac/server/consistency_manager.rb +265 -0
  30. data/lib/lwac/server/db_conn.rb +376 -0
  31. data/lib/lwac/server/storage_manager.rb +290 -0
  32. data/lib/lwac/shared/data_types.rb +283 -0
  33. data/lib/lwac/shared/identity.rb +44 -0
  34. data/lib/lwac/shared/launch_tools.rb +87 -0
  35. data/lib/lwac/shared/multilog.rb +158 -0
  36. data/lib/lwac/shared/serialiser.rb +86 -0
  37. data/limits.md +114 -0
  38. data/log_config.md +30 -0
  39. data/monitoring.md +13 -0
  40. data/resources/schemata/mysql/links.sql +7 -0
  41. data/resources/schemata/sqlite/links.sql +5 -0
  42. data/server_config.md +242 -0
  43. data/tools.md +89 -0
  44. data/workflows.md +39 -0
  45. metadata +140 -0
@@ -0,0 +1,145 @@
1
+ <html>
2
+ <head>
3
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
4
+ <title><%= filename.to_s[0..-(File.extname(filename).to_s.length + 1)] %></title>
5
+
6
+ <style type="text/css">
7
+ body {
8
+ font-family: "Avenir Next", Helvetica, Arial, sans-serif;
9
+ padding:1em;
10
+ margin:auto;
11
+ // max-width:42em;
12
+ background:#fefefe;
13
+ }
14
+
15
+ div#toc ol {
16
+ list-style-type: none;
17
+ }
18
+
19
+ h1#header{
20
+ text-align: center;
21
+ font-weight: bold;
22
+ }
23
+
24
+
25
+ h2, h3, h4, h5, h6 {
26
+ font-weight: bold;
27
+ }
28
+
29
+ h1 {
30
+ font-weight: 70%;
31
+ color: #000000;
32
+ font-size: 26pt;
33
+ }
34
+
35
+ h2 {
36
+ border-bottom: 1px solid #CCCCCC;
37
+ color: #000000;
38
+ font-size: 24px;
39
+ }
40
+
41
+ h3 {
42
+ font-size: 18px;
43
+ }
44
+
45
+ h4 {
46
+ font-size: 16px;
47
+ }
48
+
49
+ h5 {
50
+ font-size: 14px;
51
+ }
52
+
53
+ h6 {
54
+ color: #777777;
55
+ background-color: inherit;
56
+ font-size: 14px;
57
+ }
58
+
59
+ hr {
60
+ height: 0.2em;
61
+ border: 0;
62
+ color: #CCCCCC;
63
+ background-color: #CCCCCC;
64
+ }
65
+
66
+ p, blockquote, ul, ol, dl, li, table, pre {
67
+ margin: 15px 0;
68
+ }
69
+
70
+ a, a:visited {
71
+ color: #4183C4;
72
+ background-color: inherit;
73
+ text-decoration: none;
74
+ }
75
+
76
+ #message {
77
+ border-radius: 6px;
78
+ border: 1px solid #ccc;
79
+ display:block;
80
+ width:100%;
81
+ height:60px;
82
+ margin:6px 0px;
83
+ }
84
+
85
+ button, #ws {
86
+ font-size: 10pt;
87
+ padding: 4px 6px;
88
+ border-radius: 5px;
89
+ border: 1px solid #bbb;
90
+ background-color: #eee;
91
+ }
92
+
93
+ code, pre, #ws, #message {
94
+ font-size: 10pt;
95
+ border-radius: 3px;
96
+ background-color: #F8F8F8;
97
+ color: inherit;
98
+ }
99
+
100
+ code {
101
+ border: 1px solid #EAEAEA;
102
+ margin: 0 2px;
103
+ padding: 0 5px;
104
+ }
105
+
106
+ pre {
107
+ border: 1px solid #CCCCCC;
108
+ overflow: auto;
109
+ padding: 4px 8px;
110
+ }
111
+
112
+ pre > code {
113
+ border: 0;
114
+ margin: 0;
115
+ padding: 0;
116
+ }
117
+
118
+ #ws { background-color: #f8f8f8; }
119
+
120
+ .send { color:#77bb77; }
121
+ .server { color:#7799bb; }
122
+ .error { color:#AA0000; }
123
+
124
+ .bold{
125
+ font-weight: bold;
126
+ }
127
+
128
+ </style>
129
+ </head>
130
+ <body>
131
+
132
+ <h1 id="header">LWAC <%=version%> User Guide</h1>
133
+
134
+ <div id="nav">
135
+ <div id="menu" style="border-radius: 3px; margin: 0 auto; font-size: smaller; background-color: #ddd; text-align: center; padding: 0.2em 0.4em;">
136
+ <%= pages.sort.map{|p| "<a href=\"#{p}.html\" " + ((p == filename.to_s[0..-(File.extname(filename).to_s.length + 1)]) ? 'style="color: black;"' : '') + ">#{p}</a>"}.join("&nbsp;|&nbsp;") %>
137
+ </div>
138
+ </div>
139
+ <%=content%>
140
+
141
+ <div id="footer" style="border-radius: 3px; margin: 0 auto; font-size: smaller; background-color: #ddd; text-align: center; padding: 0.2em 0.4em;">
142
+ Generated on <%=Time.now%>
143
+ </div>
144
+ </body>
145
+ </html>
@@ -0,0 +1,33 @@
1
+ ---
2
+ :server: # Options affecting which server to use
3
+ :address: "148.88.227.135"
4
+ :port: 27400 # Port
5
+ :serialiser: :marshal # :marshal, :json, :msgpack, or :yaml. :marshal highly recommended.
6
+ :network: # Network behaviour when talking to the server
7
+ :connect_timeout: 20 # How long we give the socket to respond
8
+ :minimum_reconnect_time: 1 # The minimum time we take before trying again
9
+ :maximum_reconnect_time: 240 # The maximum time we take before trying again
10
+ :connect_failure_penalty: 3 # The amount we wait extra each time it fails.
11
+ :client: # Properties of the client
12
+ :announce_progress: true
13
+ :monitor_rate: 0.5 # Check download progress every n seconds (<=1 recommended)
14
+ :uuid_salt: "JV" # What to call oneself in logs. Will be prefixed to a hash computed from the hostname.
15
+ :batch_capacity: 5000 # How many links to download at once? Careful not to run out of RAM or take too long.
16
+ :cache_limit: 400 # How large a client cache should be. At most two will be in memory at once.
17
+ :check_in_size: 400 # How large chunks should be when checking in in MB.
18
+ :strict_cache_limit: true # Prevents workers from adding to the cache if it grows beyond check_in_size. May slow down progress depending on batch size, but enforces memory limits more effectively
19
+ :simultaneous_workers: 200 # Simultaneous workers bigger is more parallel.
20
+ :cache_dir: # Set to nil to use ram
21
+ # :cache_file: /tmp/ # Set to nil to use ram
22
+ :logging: # Log output options
23
+ :progname: Client # Name used in logs
24
+ :logs: # List of log outputs
25
+ :default: # Log is called "default"
26
+ :dev: STDOUT # Filename to log to, or "STDOUT", or "STDERR"
27
+ :level: :info # Level to report at. One of :debug, :warn, :info, :error, or :fatal
28
+ #:errors:
29
+ #:dev: 'logs/client.err'
30
+ #:level: :warn
31
+ :file_log:
32
+ :dev: 'logs/client.log'
33
+ :level: :info
@@ -0,0 +1,34 @@
1
+ ---
2
+ :server: # Options affecting which server to use
3
+ :hostname: localhost # IP or hostname
4
+ :port: 27401 # Port
5
+ :password: lwacpass
6
+ :secret: grnionfgn89540ng8953n8g0n54890gn90345ng95noe4ig54nio # encryption key
7
+ :network: # Network behaviour when talking to the server
8
+ :connect_timeout: 20 # How long we give the socket to respond
9
+ :minimum_reconnect_time: 1 # The minimum time we take before trying again
10
+ :maximum_reconnect_time: 240 # The maximum time we take before trying again
11
+ :connect_failure_penalty: 3 # The amount we wait extra each time it fails.
12
+ :client: # Properties of the client
13
+ :announce_progress: true
14
+ :monitor_rate: 0.5 # Check download progress every n seconds (<=1 recommended)
15
+ :uuid_salt: "LOCAL" # What to call oneself in logs. Will be prefixed to a hash computed from the hostname.
16
+ :batch_capacity: 10000 # How many links to download at once? Careful not to run out of RAM or take too long.
17
+ :cache_limit: 209715200 # How large a client cache should be. At most two will be in memory at once.
18
+ :check_in_size: 209715200 # How large chunks should be when checking in in MB.
19
+ :strict_cache_limit: true # Prevents workers from adding to the cache if it grows beyond check_in_size. May slow down progress depending on batch size, but enforces memory limits more effectively
20
+ :simultaneous_workers: 500 # Simultaneous workers bigger is more parallel.
21
+ :cache_dir: # Set to nil to use ram
22
+ # :cache_file: /tmp/ # Set to nil to use ram
23
+ :logging: # Log output options
24
+ :progname: Client # Name used in logs
25
+ :logs: # List of log outputs
26
+ :default: # Log is called "default"
27
+ :dev: STDOUT # Filename to log to, or "STDOUT", or "STDERR"
28
+ :level: :info # Level to report at. One of :debug, :warn, :info, :error, or :fatal
29
+ #:errors:
30
+ #:dev: 'logs/client.err'
31
+ #:level: :warn
32
+ :file_log:
33
+ :dev: 'logs/client.log'
34
+ :level: :info
@@ -0,0 +1,70 @@
1
+ ---
2
+ :server_config: example_config/server.yml # Export reads the storage config from a server. Point this at the server config.
3
+ :output: # Output config, which is pretty much everything
4
+ :announce: 2000 # Every nth line, update the UI
5
+
6
+ :formatter: :multitemplate # :csv for single-file csv output, :multicsv for one-csv-per-point, :json for json file, :multitemplate for one-erb-per-point
7
+ :formatter_opts:
8
+ :filename: exported_data/sample_#{data.sample.id}/#{data.datapoint.id}.html # Write to this file or directory, depending on exporter system
9
+ # :filename: exported_data/sample_#{data.sample.id}/#{data.datapoint.id}.csv # Write to this file or directory, depending on exporter system
10
+ # :filename: export.csv # Multicsv and template option. Select filename from one of the keys in the line
11
+ :template: example_config/export_template_html_sampler.erb
12
+ :xml_format: :default # :pretty, :whitespace or :default
13
+ :xml_indent: 8 # How many spaces to indent for :pretty or :whitespace
14
+ :csv_opts:
15
+ :fields: # Formatters define how data is output to CSV, once selected.
16
+ :sample_id: sample.id # Make the 'sample_id' field contain the sample.id variable.
17
+ :link_id: datapoint.id # link_id will contain the datapoint.id variable, etc...
18
+ :link_uri: datapoint.uri
19
+ :redirected: # 'redirected' field will include the output of the expression below
20
+ :expr: "return (data.datapoint.response.effective_uri and data.datapoint.uri.chomp('/') == data.datapoint.response.effective_uri.chomp('/'))"
21
+ :dns_time: datapoint.response.dns_lookup_time
22
+ :redirect_time: # 'redirect_time' will contain the variable datapoint.response.redirect_time only if condition evaluates to true, and "" otherwise
23
+ :var: datapoint.response.redirect_time
24
+ :condition: "(x and x.to_f > 0)"
25
+ :missing: ""
26
+ :rtt: datapoint.response.round_trip_time
27
+ :response_code: datapoint.response.code
28
+ :imperfect:
29
+ :expr: "data.datapoint.response.code.to_i == 200"
30
+ :redirect_proportion: # A long expression on multiple lines
31
+ :expr: >
32
+ r = data.datapoint.response
33
+
34
+ if(r.redirect_time and r.redirect_time > 0) then
35
+ return r.redirect_time.to_f / r.round_trip_time.to_f
36
+ else
37
+ return "NA"
38
+ end
39
+ :sample_file: sample.path
40
+ :sample_dir: sample.dir
41
+ :datapoint_dir: datapoint.dir
42
+ :datapoint_path: datapoint.path
43
+ :last_id: sample.last_contiguous_id
44
+ # :raw: datapoint.body # Body content
45
+ #:l2: datapoint.id
46
+ :headers: true # Output CSV headers?
47
+ :level: :datapoint # output one line per :server, :sample or :datapoint
48
+ # note that if you select :server, all the
49
+ # sample and datapoint vars will be nil
50
+ :filters: # Filters define the selection of data to output
51
+ :server: # server level filters
52
+ :sample: # sample level filters
53
+ # #:test_filter: data.sample.id > 1 and data.sample.id < 3
54
+ # :test_filter2: data.sample.id == 1
55
+ :datapoint: # datapoint level filters
56
+ # :test_filter3: data.datapoint.id > 10
57
+ :logging: # Log output settings
58
+ :progname: Export # Name of the app in the logs
59
+ :logs: # List of logs
60
+ :default: # Log name 'default'
61
+ :dev: STDOUT # File to log to, or 'STDOUT', or 'STDERR'
62
+ :level: :info # Level to log. One of :debug, :warn, :info, :error, or :fatal
63
+ #:errors:
64
+ #:dev: 'logs/server.err'
65
+ #:level: :warn
66
+ :file_log:
67
+ :dev: 'logs/export.log'
68
+ :level: :info
69
+
70
+
@@ -0,0 +1,19 @@
1
+ ---
2
+ :server_config: example_config/server.yml # Import reads the storage config from a server. Point this at the server config.
3
+ :schemata_path: # Where to find .sql files for use as a schema. Leave blank for auto/default schemata
4
+ #:schemata_path: ./resources/schemata # Where to find .sql files for use as a schema
5
+ :notify: 12345 # Update the UI once per n links
6
+ :create_db: true # Create the db if it doesn't already exist?
7
+ :logging: # Logging and output options
8
+ :progname: Import # What to call the app in logs
9
+ :logs: # A list of log outputs
10
+ :default: # Call this log "default"
11
+ :dev: STDOUT # Where to log. Give a filename, or use 'STDOUT' or 'STDERR'
12
+ :level: :debug # Level to log. One of :debug, :warn, :info, :error, or :fatal
13
+ #:errors:
14
+ #:dev: 'logs/server.err'
15
+ #:level: :warn
16
+ :file_log:
17
+ :dev: 'logs/server.log'
18
+ :level: :info
19
+
@@ -0,0 +1,97 @@
1
+ ---
2
+ :storage: # Options affecting the data storage engine
3
+ :root: corpus # A path to the corpus directory (root/)
4
+ :state_file: state # What to call the file where server state is stored (root/state_file)
5
+ :sample_subdir: samples # Where to keep samples (root/sample_subdir)
6
+ :sample_filename: sample # What to call each file (root/sample_subdir/id/sample.yml)
7
+ :files_per_dir: 1000 # How many files to store in each directory (root/sample_subdir/id/XX/)
8
+ :serialiser: :marshal # :marshal, :json or :yaml. :marshal is faster but :yaml can be read by other languages [:msgpack is pending]
9
+ :database: # Config options affecting the metadata database
10
+ :engine: :sqlite # :sqlite or :mysql
11
+ :engine_conf:
12
+ :filename: corpus/links.db # Name of the database, relative to corpus root (root/filename)
13
+ :transaction_limit: 100 # How many requests per transaction. Larger numbers yield a speed boost at the expense of data security and memory use.
14
+ :pragma: # Custom pragmas. See SQLite's docs. Default settings yield speed and fair data security
15
+ "locking_mode": "EXCLUSIVE" # Do not allow others to access the db when the server is running
16
+ "cache_size": 20000 # Allow a large cache
17
+ "synchronous": 0 # Asynchronous operations speed things up a lot
18
+ "temp_store": 2 # Use temp storage
19
+ # :engine_conf: # Options from https://github.com/brianmario/mysql2
20
+ # :username: lwac
21
+ # :password: lwacpass
22
+ # # :host: localhost
23
+ # # :port: 3345
24
+ # :socket: /var/run/mysqld/mysqld.sock
25
+ # :database: lwac
26
+ # :encoding: 'utf8'
27
+ # :read_timeout: 10 #seconds,
28
+ # :write_timeout: 10 #seconds,
29
+ # :connect_timeout: 10 #seconds,
30
+ # :reconnect: true #/false,
31
+ # :local_infile: true #/false,
32
+ :table: links # The name of the links table within the sqlite db
33
+ :fields: # Fields for the link table:
34
+ :id: id # The ID field is called 'id' by default
35
+ :uri: uri # The field with the URIs in it is called 'uri' by default
36
+ :sampling_policy: # Defines sampling times
37
+ :sample_limit: # Stop at the nth sample (or refuse to sample if n is already over this!) Set to nil to disable
38
+ :sample_time: 60 #43200 # Sample every n seconds (i.e. 3600 for hourly, 86400 for daily, etc)
39
+ :sample_alignment: 0 #28800 # Sample alignment. Set to 0 for sampling 'on the hour'/'midnight', etc
40
+ :client_policy:
41
+ :dry_run: false # Don't actually do any downloading if set to true
42
+ # ----
43
+ :fix_encoding: true # Attempt to fix the encoding of output?
44
+ :target_encoding: UTF-8 # Target encoding
45
+ :encoding_options: # options supported see http://ruby-doc.org/core-1.9.3/String.html#method-i-encode
46
+ :invalid: :replace # If value is :replace , replaces invalid chars with the :replace char.
47
+ :undef: :replace # If value is :replace, replaces undefined chars with the :replace char.
48
+ #:replace: '?' # the char to use in replacement, defaults to uFFFD for unicode and '?' for other targets
49
+ #:fallback: # some object supporting [], to look up [invalid char in source encoding] = valid char in destination encoding
50
+ #'from': 'to'
51
+ #'from2': 'to2'
52
+ #:xml: :attr # either :text or :attr. If :text, replaces things with hex entities, if :attr, it also quotes the entities "&quot;"
53
+ #:cr_newline: true # Replaces LF(\n) with CR(\r) if true
54
+ #:crlf_newline: # Replaces LF(\n) with CRLF(\r\n) if true
55
+ :universal_newline: true # Replaces CRLF(\r\n) and CR(\r) with LF(\n) if true
56
+ # ----
57
+ :max_body_size: 20971520 # Stop downloading if over this number of bytes is downloaded, by default set to 20MB, 20971520
58
+ :mimes:
59
+ :policy: :whitelist # :whitelist to allow only those in list, :blacklist to deny only those in list
60
+ :ignore_case: true
61
+ :list: # List of mime types to allow/deny
62
+ - ^text\/?.*$ # text-only mimes
63
+ #- ^.+$ # anything with a valid content-type
64
+ :curl_workers: # Options affecting individual web request clients themselves
65
+ :max_redirects: 5 # How many redirects to follow. 5 seems standard in browsers
66
+ :useragent: ! '"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11"' # How to ID oneself to the server
67
+ :enable_cookies: true
68
+ # :headers: "Header: String"
69
+ :verbose: false
70
+ :follow_location: true # Should it follow location headers?
71
+ :timeout: 20 # Overall timeout per request. Set to a user's attention span.
72
+ :connect_timeout: 10 # TCP connect timeout.
73
+ :dns_cache_timeout: 10 # DNS lookup timeout.
74
+ :ftp_response_timeout: 10 # FTP response timeout.
75
+ :client_management: # Management of downloader clients
76
+ :time_per_link: 5 # Allow n second[s] per link before timing out and resetting the client's assignments. Used when the server doesn't yet know how fast a client is.
77
+ :dynamic_time_overestimate: 1.3 # Allow n times the client's previous performance before timing out and resetting the client's assignments.
78
+ :empty_client_backoff: 60 # If we find no links waiting for download, tell the client to wait for this time before trying again
79
+ :delay_overestimate: 10 # Overestimate any sample policy delays by up to this amount. Helps avoid clock drift issues.
80
+ :server: # Network server properties
81
+ :hostname: # The interface to listen on
82
+ :port: 27401 # The port to listen on
83
+ :password: lwacpass # password
84
+ :secret: grnionfgn89540ng8953n8g0n54890gn90345ng95noe4ig54nio # encryption key
85
+ :logging: # Logging and output options
86
+ :progname: Server # What to call the app in logs
87
+ :logs: # A list of log outputs
88
+ :default: # Call this log "default"
89
+ :dev: STDOUT # Where to log. Give a filename, or use 'STDOUT' or 'STDERR'
90
+ :level: :debug # Level to log. One of :debug, :warn, :info, :error, or :fatal
91
+ #:errors:
92
+ #:dev: 'logs/server.err'
93
+ #:level: :warn
94
+ :file_log:
95
+ :dev: 'logs/server.log'
96
+ :level: :info
97
+
@@ -0,0 +1,448 @@
1
+ Export Tool Configuration
2
+ =========================
3
+ The export tool accesses the server's corpus and exports it according to a series of complex policies. As such, its configuration file is very open-ended and may contain significant portions of ruby code; however, every effort has been made to make simple things simple.
4
+
5
+ Data Access
6
+ -----------
7
+ Many of the features of the export tool require an understanding of how data is structured within. Data are stored as a large tree, sorted into three main levels. Variables are accessed on the tree by using dot notation, as with any member variables in ruby (i.e. `data.sample.id` will get the ID of the current sample).
8
+
9
+ Note that the export tool will only allow exporting of complete samples, i.e. those where all the links have been downloaded and the sample closed. This allows it to be used whilst the server is still running.
10
+
11
+ The hierarchy is currently as below:
12
+
13
+ * `data` --- Root object, containing each level as a member
14
+ * `.server` --- Server-level variables, including server state
15
+ * `.links` --- A list of links available to download
16
+ * `.complete_sample_count` --- How many samples have completely downloaded and have available data?
17
+ * `.complete_samples` --- A list containing the IDs of all the complete samples
18
+ * `.next_sample_date` --- The date of the next sample due
19
+ * `.current_sample_id` --- The ID of the current sample (the next, incomplete, one)
20
+ * `.version` --- The version of the server used to write the corpus
21
+ * `.config` --- The server configuration, as a hash
22
+ * `.sample` --- Sample-level variables
23
+ * `.id` --- The ID of the sample
24
+ * `.start_time` --- The time the sample started acquiring data
25
+ * `.end_time` --- The time the sample stopped and checked in the final link
26
+ * `.start_time_s` --- The start time in seconds from the UNIX epoch
27
+ * `.end_time_s` --- The end time in seconds from the UNIX epoch
28
+ * `.complete` --- Boolean. Is the sample complete?
29
+ * `.open` --- Boolean. Is the sample open?
30
+ * `.size` --- How many links are covered by the sample?
31
+ * `.duration` --- How long did the sample take, in seconds? (`end_time_s` - `start_time_s`)
32
+ * `.last_contiguous_id` --- The last id read from the database. Links yet to be completed equal (sample.size - last_contiguous_id) union (pending_links)
33
+ * `.size_on_disk` --- The approximate filesize on disk, in bytes, of all data in this sample
34
+ * `.dir` --- The directory for that sample, relative to the current working directory
35
+ * `.path` --- The filepath of the sample information file, relative to the current working directory
36
+ * `.datapoint` --- Datapoint-level variables
37
+ * `.id` --- The ID of the datapoint/link
38
+ * `.uri` --- The URI requested to acquire the data
39
+ * `.path` --- The full filepath of the datapoint file, relative to the current working directory
40
+ * `.dir` --- The directory in which the datapoint resides, relative to the current working directory
41
+ * `.client_id` --- The ID of the client that did the work
42
+ * `.error` --- Any errors reported during download
43
+ * `.headers` --- A hash containing the HTTP headers
44
+ * `.head` --- A string containing the HTTP headers
45
+ * `.body` --- The body content of the HTTP response
46
+ * `.response` --- The response object properties, as reported by cURL (Hash)
47
+ * `.round_trip_time` --- The total time for the request
48
+ * `.redirect_time` --- The time spent in redirects
49
+ * `.dns_lookup_time` --- The time spent looking up DNS
50
+ * `.effective_uri` --- The 'real' URI used, after redirects
51
+ * `.code` --- The response code
52
+ * `.download_speed` --- The download speed reported by cURL
53
+ * `.downloaded_bytes` --- The number of bytes downloaded, as reported by cURL
54
+ * `.encoding` --- The encoding, as reported by cURL. Note that this seems unreliable
55
+ * `.truncated` --- Boolean. `true` if the body was truncated due to the server's maximum filesize limit
56
+ * `.dry_run` --- Boolean. `true` if this datapoint was sampled as part of a dry run (no data will have been transferred to/from the web)
57
+ * `.mime_allowed` --- Boolean. `false` if the MIME type policy on the server caused this document's body to be discarded, or `true` otherwise
58
+
59
+
60
+ Calling the `.describe` method on any resource will output a tree containing its data, such as the one below, generated from a sample corpus:
61
+
62
+ Data{
63
+ server : {
64
+ links : [1, 2, 3]
65
+ complete_sampl...: 2
66
+ complete_samples : [0, 1]
67
+ next_sample_date : 1366381980
68
+ current_sample_id: 1
69
+ config : {
70
+ storage : {
71
+ root : corpus
72
+ state_file : state
73
+ sample_subdir : samples
74
+ sample_filename : sample
75
+ files_per_dir : 1000
76
+ database : {
77
+ filename : corpus/links.db
78
+ table : links
79
+ transaction_limit: 100
80
+ pragma : {
81
+ locking_mode : EXCLUSIVE
82
+ cache_size : 20000
83
+ synchronous : 0
84
+ temp_store : 2
85
+ }
86
+ fields : {
87
+ id : id
88
+ uri : uri
89
+ }
90
+ }
91
+ }
92
+ sampling_policy : {
93
+ sample_limit : 2
94
+ sample_time : 60
95
+ sample_alignment : 0
96
+ }
97
+ client_policy : {
98
+ dry_run : false
99
+ fix_encoding : true
100
+ target_encoding : UTF-8
101
+ encoding_options : {
102
+ invalid : replace
103
+ undef : replace
104
+ universal_newline: true
105
+ }
106
+ max_body_size : 20971520
107
+ mimes : {
108
+ policy : whitelist
109
+ ignore_case : true
110
+ list : ["^text\\/?.*$"]
111
+ }
112
+ curl_workers : {
113
+ max_redirects : 5
114
+ useragent : "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US...
115
+ enable_cookies : true
116
+ verbose : false
117
+ follow_location : true
118
+ timeout : 60
119
+ connect_timeout : 10
120
+ dns_cache_timeout: 10
121
+ ftp_response_t...: 10
122
+ }
123
+ }
124
+ client_management: {
125
+ time_per_link : 5
126
+ empty_client_b...: 60
127
+ delay_overesti...: 10
128
+ }
129
+ server : {
130
+ interfaces : [{:interface=>"localhost", :port=>27400}]
131
+ service_name : downloader
132
+ }
133
+ logging : {
134
+ progname : Server
135
+ logs : {
136
+ default : {
137
+ dev : STDOUT
138
+ level : info
139
+ }
140
+ file_log : {
141
+ dev : logs/server.log
142
+ level : info
143
+ }
144
+ }
145
+ }
146
+ }
147
+ version : 0.2.0b
148
+ }
149
+ sample : {
150
+ id : 1
151
+ start_time : 2013-04-19 15:33:10 +0100
152
+ end_time : 2013-04-19 15:33:11 +0100
153
+ complete : true
154
+ open : false
155
+ size : 3
156
+ duration : 1.406624844
157
+ start_time_s : 1366381990
158
+ end_time_s : 1366381991
159
+ size_on_disk : 214259.0
160
+ last_contiguou...: 3
161
+ dir : corpus/samples/1
162
+ path : corpus/samples/1/sample
163
+ }
164
+ datapoint : {
165
+ id : 3
166
+ uri : http://google.co.uk
167
+ dir : corpus/samples/1/0
168
+ path : corpus/samples/1/0/3
169
+ client_id : LOCAL3_7ba2f8cd03d79efbbaa4b1c561759c6e
170
+ error :
171
+ headers : {
172
+ Location : http://www.google.co.uk/
173
+ Content_Type : text/html; charset=UTF-8
174
+ Date : Fri, 19 Apr 2013 14:33:10 GMT
175
+ Expires : -1
176
+ Cache_Control : private, max-age=0
177
+ Server : gws
178
+ Content_Length : 221
179
+ X_XSS_Protection : 1; mode=block
180
+ X_Frame_Options : SAMEORIGIN
181
+ Set_Cookie : NID=67=B7dOglOF9YR3BvNje7Xgy_FAHcHIgJMW3HGm9HYI...
182
+ P3P : CP="This is not a P3P policy! See http://www.go...
183
+ Transfer_Encoding: chunked
184
+ }
185
+ head : HTTP/1.1 301 Moved Permanently\nLocation: http:/...
186
+ body : <!doctype html><html itemscope="itemscope" item...
187
+ response : {
188
+ round_trip_time : 0.313531
189
+ redirect_time : 0.219863
190
+ dns_lookup_time : 0.00129
191
+ effective_uri : http://www.google.co.uk/
192
+ code : 200
193
+ download_speed : 163125.0
194
+ downloaded_bytes : 51145.0
195
+ encoding : text/html; charset=UTF-8
196
+ truncated : false
197
+ mime_allowed : true
198
+ dry_run : false
199
+ }
200
+ }
201
+ }
202
+
203
+
204
+ Config
205
+ -------------
206
+ The export tool uses the server configuration to access a corpus, and loads it as if it were a server.
207
+
208
+ * `server_config` --- The path to the server configuration file.
209
+
210
+ For example:
211
+
212
+ :server_config: example_config/server.yml
213
+
214
+
215
+ Output
216
+ ------
217
+ Output is controlled using a filter/format system:
218
+
219
+ 1. Data is selected for export only if one of the filter expressions matches. Filters work at the server, sample, or datapoint level.
220
+ 2. Formatters transform the data for output, depending on your rules
221
+
222
+
223
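+ * `formatter` --- The format system to use. Options are `:csv`, `:multicsv`, `:json`, `:multitemplate` and `:multixml`; each is described in the Formatters section.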
224
+ * `formatter_opts` --- Options to control the formatter in question. This is unique to each formatter, and they are all documented in their respective sections.
225
+ * `announce` --- How often to update the terminal with progress information
226
+ * `headers` --- Boolean. Should the script tell the formatter to output a header?
227
+ * `level` --- What level to export at. Possible values are `:server`, `:sample` or `:datapoint`. This is partially used for optimisation---exporting datapoint-level variables with `level` set to `:server` will result in them all being nil. See [concepts](concepts.html) for more information on what the levels correspond to within the LWAC system.
228
+
229
+ * `filters[]` --- This is outlined in its own section below...
230
+
231
+ For example:
232
+
233
+ :output:
234
+ :announce: 2000
235
+ :headers: true
236
+ :level: :datapoint
237
+ :formatter: ...see below...
238
+ :formatter_opts:
239
+ ...
240
+ :filters:
241
+ ...
242
+
243
+
244
+
245
+ ### Filtering
246
+ Filters are small scripts that, presented with data, return true to include a value in output, or false to discount it. Filters may operate at any level to exclude a certain `:server`, `:sample`, or `:datapoint`, and are defined in one of these three lists.
247
+
248
+ * `filters/server{}`, `filters/sample{}`, `filters/datapoint{}` --- Each entry in one of these lists should be an expression that evaluates to `true`/non-`nil` or `false`/`nil`.
249
+
250
+ Data access is governed by a 'data' object containing a hierarchy of all available data at the given level. See 'Data Access' above for more information on how to refer to specific variables.
251
+
252
+ For example:
253
+
254
+ :filters:
255
+ :server:
256
+ :sample:
257
+ :test_filter: data.sample.id > 1 and data.sample.id < 3
258
+ :test_filter2: data.sample.id == 1
259
+ :datapoint:
260
+ :test_filter3: data.datapoint.id > 10
261
+
262
+
263
+
264
+ Formatters
265
+ ----------
266
+ Formatters are small scripts which transform the data into some usable format. Currently there are a few of these, and each has its own options:
267
+
268
+ * `:csv` --- Outputs to a single CSV file
269
+ * `:multicsv` --- Outputs to multiple CSV files (one per point)
270
+ * `:json` --- Outputs serialised JSON to a file (or a pipe) --- useful for IPC if you have non-ruby formatters
271
+ * `:multitemplate` --- Outputs to one ERB template per point, capable of rendering XML, HTML, etc
272
+ * `:multixml` --- XML output of all data for later transformation using XSLT
273
+
274
+ ### csv
275
+ The CSV formatter outputs a single CSV file at the level requested. It uses Ruby's FasterCSV implementation, and supports all of the options therein (such as changing separator, quote and line characters) as well as using the standardised field formatting routines outlined below:
276
+
277
+ * `filename` --- The filename to output to
278
+ * `csv_opts{}` --- A hash of CSV options, as conforming to the ruby specification [here](http://ruby-doc.org/stdlib-1.9.2/libdoc/csv/rdoc/CSV.html)
279
+ * `fields` --- A hash of key-expression sets conforming to the Field Formatting guidelines below
280
+
281
+ For example:
282
+
283
+ :formatter: :csv
284
+ :formatter_opts:
285
+ :filename: export.csv
286
+ :csv_opts:
287
+ :separator: "\t"
288
+ :fields:
289
+ ...
290
+
291
+ ### multicsv
292
+ The MultiCSV formatter is capable of producing one CSV file per point. Aside from the filename, it is otherwise identical to the CSV formatter:
293
+
294
+ * `filename` --- An expression that outputs the filename. Variables can easily be included in a string using ruby's `#{}` syntax: such as "/#{data.sample.id}/datapoint#{data.datapoint.id}.csv". Directories will be created if they don't already exist.
295
+ * `csv_opts{}` --- A hash of CSV options, as conforming to the ruby specification [here](http://ruby-doc.org/stdlib-1.9.2/libdoc/csv/rdoc/CSV.html)
296
+ * `fields` --- A hash of key-expression sets conforming to the Field Formatting guidelines below
297
+
298
+ For example:
299
+
300
+ :formatter: :multicsv
301
+ :formatter_opts:
302
+ :filename: exported_data/sample_#{data.sample.id}/#{data.datapoint.id}.csv
303
+ :csv_opts: # defaults
304
+ :fields:
305
+ ...
306
+
307
+ ### json
308
+ The JSON formatter is primarily designed to ship data elsewhere for processing by languages other than ruby. It writes to a single file, and flushes after each point has been written for use with named pipes.
309
+
310
+ * `filename` --- A string filename to write to
311
+ * `fields` --- A hash of key-expression sets conforming to the Field Formatting guidelines below
312
+
313
+ If `headers` is set to true, the formatter will output an array of headers as the first line, then it will output one point per line (separated using the unix `\n` character) as an array thereafter.
314
+
315
+ For example:
316
+
317
+ :formatter: :json
318
+ :formatter_opts:
319
+ :filename: export.pipe
320
+ :fields:
321
+ ...
322
+
323
+ ### multitemplate
324
+ This runs a specified ERB template for each point. Since ERB templates are already powerful ways of including expressions and data cleaning, this formatter doesn't use the Field Formatting conventions, and thus supports more complex forms of output. It is the ideal way of exporting raw data, XML, or summaries to human-readable form, and a number of templates are provided in the example config for these purposes.
325
+
326
+ * `filename` --- An expression that outputs the filename. Variables can easily be included in a string using ruby's `#{}` syntax: such as "/#{sample.id}/datapoint#{data.datapoint.id}.html". Directories will be created if they don't already exist.
327
+ * `template` --- The path to a template
328
+
329
+ For example:
330
+
331
+ :formatter: :multitemplate
332
+ :formatter_opts:
333
+ :filename: exported_data/sample_#{data.sample.id}/#{data.datapoint.id}.html
334
+ :template: example_config/export_template_html_sampler.erb
335
+
336
+ ### multixml
337
+ This exports all data to XML for each point, using REXML to handle the generation. This XML file may then be transformed into another format (e.g. TEI lite or similar) using XSLT. The available options largely affect the style of output:
338
+
339
+ * `filename` --- An expression that outputs the filename. Variables can easily be included in a string using ruby's `#{}` syntax: such as "/#{sample.id}/datapoint#{data.datapoint.id}.xml". Directories will be created if they don't already exist.
340
+ * `xml_format` --- one of `:default`, `:pretty`, or `:whitespace`. The `:default` and `:whitespace` options preserve document whitespace, whereas `:pretty` destroys it in the name of beauty.
341
+ * `xml_indent` --- The number of spaces to use as an indent. Use 0 to disable indenting. Note that this doesn't apply for the `:default` `xml_format`, which does not handle indenting.
342
+
343
+ Unfortunately, it is rather slow compared to the others. If you wish to use a specific XML format, I therefore recommend writing your own template using the multitemplate formatter.
344
+
345
+ For example:
346
+
347
+ :formatter: :multixml
348
+ :formatter_opts:
349
+ :filename: exported_data/sample_#{data.sample.id}/#{data.datapoint.id}.xml
350
+ :xml_format: :pretty
351
+ :xml_indent: 2
352
+
353
+
354
+ Field Formatting
355
+ ----------------
356
+ Key-value formatters, such as `:csv` and `:multicsv`, use a common format system based on small scripts. The need to select data from the main set is generally complicated by the need to handle missing data, especially where data is acquired from the web and may be particularly messy. Variables may be formatted for output using one of three structures, each defined by a hash.
357
+
358
+ * `fields/FIELD_NAME{}` --- Output the result of the hash contained in `FIELD_NAME` as `FIELD_NAME`. The contents of the `FIELD_NAME` hash may be one of the formats detailed below
359
+
360
+ ### Simple Variable Formatting
361
+ This is the simplest way of outputting a value, and should work in most instances. To use it, simply specify the variable name (the `data.` prefix is optional).
362
+
363
+ For example:
364
+
365
+ :format:
366
+ :sample_id: sample.id
367
+ :link_id: datapoint.id
368
+
369
+ This will output a CSV with two columns, `sample_id` and `link_id`.
370
+
371
+
372
+ ### Variable-and-condition
373
+ This will output the contents of a given variable if and only if a condition is true, and may be used to ensure that certain values are reported as missing. It is defined as a hash containing three properties:
374
+
375
+ * `FIELD_NAME/var` --- The variable in question. The preceding `data.` may be omitted, as with simple variable formatting above.
376
+ * `FIELD_NAME/condition` --- An expression that evaluates to true if the value is to be output. The value in question will be called `x` in the expression.
377
+ * `FIELD_NAME/missing` --- A value to output if the expression above evaluates to false.
378
+
379
+ For example:
380
+
381
+ :format:
382
+ :redirect_time:
383
+ :var: datapoint.response.redirect_time
384
+ :condition: "(x and x.to_f > 0)"
385
+ :missing: ""
386
+
387
+ This ensures that the `redirect_time` field is only populated if it is non-`nil` and contains a value over zero.
388
+
389
+ ### Expression-based Formatting
390
+ The most powerful, and complex, form of formatting relies on a free-form ruby expression to return a value for output. The expression in question is provided as a string, as with other expression objects, and may handle any variables within the export tool whilst processing. There is only one entry in the hash required:
391
+
392
+ * `FIELD_NAME/expr` --- The expression to be used. *Must* return a value.
393
+
394
+ For example:
395
+
396
+ :format:
397
+ :okay_resp:
398
+ :expr: "data.datapoint.response.code.to_i == 200"
399
+ :redirect_proportion: # A long expression on multiple lines
400
+ :expr: >
401
+ r = data.datapoint.response
402
+
403
+ if(r.redirect_time and r.redirect_time > 0) then
404
+ return r.redirect_time.to_f / r.round_trip_time.to_f
405
+ else
406
+ return "NA"
407
+ end
408
+
409
+ This example outputs two fields. The former, `okay_resp`, outputs 'true' if the response code was 200. The latter, which uses YAML's multi-line string syntax, computes the redirect time as a proportion of the total request time, as a measure of 'how redirected' something was, and returns "NA" if the times are unavailable.
410
+
411
+ For example:
412
+
413
+ :fields:
414
+ ...
415
+ :sample_id: sample.id # Make the 'sample_id' field contain the sample.id variable.
416
+ :link_id: datapoint.id # link_id will contain the datapoint.id variable, etc...
417
+ :link_uri: datapoint.uri
418
+ :redirected: # 'redirected' field will include the output of the expression below
419
+ :expr: "return (data.datapoint.response.effective_uri and data.datapoint.uri.chomp('/') == data.datapoint.response.effective_uri.chomp('/'))"
420
+ :dns_time: datapoint.response.dns_lookup_time
421
+ :redirect_time: # 'redirect_time' will contain the variable datapoint.response.redirect_time only if condition evaluates to true, and "" otherwise
422
+ :var: datapoint.response.redirect_time
423
+ :condition: "(x and x.to_f > 0)"
424
+ :missing: ""
425
+ :rtt: datapoint.response.round_trip_time
426
+ :response_code: datapoint.response.code
427
+ :imperfect:
428
+ :expr: "data.datapoint.response.code.to_i == 200"
429
+ :redirect_proportion: # A long expression on multiple lines
430
+ :expr: >
431
+ r = data.datapoint.response
432
+
433
+ if(r.redirect_time and r.redirect_time > 0) then
434
+ return r.redirect_time.to_f / r.round_trip_time.to_f
435
+ else
436
+ return "NA"
437
+ end
438
+ :sample_file: sample.path
439
+ :sample_dir: sample.dir
440
+ :datapoint_dir: datapoint.dir
441
+ :datapoint_path: datapoint.path
442
+ :last_id: sample.last_contiguous_id
443
+ # :raw: datapoint.body # Body content
444
+
445
+ Logging
446
+ -------
447
+ The logging system is the same for client, server, and export tools and shares a configuration with them. For details, see [configuring logging](log_config.html).
448
+