logbox 0.2.10

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. data/.bundle/config +3 -0
  2. data/.rvmrc +2 -0
  3. data/Gemfile +17 -0
  4. data/Gemfile.lock +30 -0
  5. data/README +14 -0
  6. data/Rakefile +74 -0
  7. data/VERSION +1 -0
  8. data/bin/download_logs +20 -0
  9. data/bin/obsstats +39 -0
  10. data/bin/rotate +17 -0
  11. data/bin/viewobs +198 -0
  12. data/lib/logbox.rb +9 -0
  13. data/lib/logbox/ansi_colors.rb +28 -0
  14. data/lib/logbox/log_parser.rb +79 -0
  15. data/lib/logbox/mockup_log.rb +44 -0
  16. data/lib/logbox/observation.rb +162 -0
  17. data/lib/logbox/observation_compiler.rb +311 -0
  18. data/lib/logbox/observation_mover.rb +142 -0
  19. data/lib/logbox/stream_wrapper.rb +20 -0
  20. data/lib/logbox/stream_wrapper/gzip_multi_file.rb +90 -0
  21. data/lib/logbox/stream_wrapper/observation_filter.rb +113 -0
  22. data/lib/logbox/stream_wrapper/order_blob_splitter.rb +96 -0
  23. data/lib/setup_environment.rb +15 -0
  24. data/logbox.gemspec +110 -0
  25. data/test/bin_viewobs_test.rb +42 -0
  26. data/test/fixtures/aws_keys_yaml.txt +3 -0
  27. data/test/fixtures/double-obs.log +1 -0
  28. data/test/fixtures/error_line.log +1 -0
  29. data/test/fixtures/log-for-md5.log +1 -0
  30. data/test/fixtures/log0.log +0 -0
  31. data/test/fixtures/log1.log +1 -0
  32. data/test/fixtures/log1.log.gz +0 -0
  33. data/test/fixtures/log2.log +2 -0
  34. data/test/fixtures/log2.log.gz +0 -0
  35. data/test/fixtures/log_invalid_mixed_encoding.log +1 -0
  36. data/test/fixtures/observation_filter.log +5 -0
  37. data/test/fixtures/unquoted_ugliness.log +2 -0
  38. data/test/log_parser_test.rb +84 -0
  39. data/test/observation_compiler_test.rb +216 -0
  40. data/test/observation_mover_test.rb +135 -0
  41. data/test/observation_test.rb +114 -0
  42. data/test/stream_wrapper/gzip_multi_file_test.rb +147 -0
  43. data/test/stream_wrapper/observation_filter_test.rb +171 -0
  44. data/test/stream_wrapper/order_blob_splitter_test.rb +129 -0
  45. data/test/test_helper.rb +23 -0
  46. metadata +177 -0
@@ -0,0 +1,142 @@
1
+ require 'digest/sha1'
2
+ require 'digest/md5'
3
+ require 'yaml'
4
+
5
+ require 'rubygems'
6
+ require 'single_instance'
7
+ require 'right_aws'
8
+
9
# Rotates a log file, gzips the rotated copies and archives them on S3.
#
# A single #run does three things in order:
#   1. renames every previously rotated file to include a short SHA1 of its
#      content and gzips it,
#   2. uploads every prepared archive to the S3 bucket and deletes it locally,
#   3. rotates the current log file and sends USR1 to the process whose pid is
#      stored in the pid file (presumably so the logger reopens its log file).
class ObservationMover

  DEFAULT_KEY_FILE = '/etc/s3_key.yml'
  DEFAULT_BUCKET = 'rwdata-logs'

  # Timestamp suffix appended to a rotated file; also used to derive the glob
  # that finds rotated files again.
  TIME_FORMAT = '-%Y-%m-%d-%H-%M-%S'

  attr_writer :key_file
  attr_writer :bucket

  # path     - path of the live log file to rotate.
  # pid_file - file holding the pid of the process that writes the log.
  def initialize(path, pid_file = '/var/run/nginx.pid')
    @dir, @file_name = File.split(path)
    @basename = File.basename(@file_name, File.extname(@file_name))
    @pid_file = pid_file
  end

  # Full path of the live log file.
  def file_path
    File.join(@dir, @file_name)
  end

  # Runs the whole prepare / archive / rotate cycle inside
  # SingleInstance.exclusive_non_blocking(:move_observations).
  def run
    SingleInstance.exclusive_non_blocking(:move_observations) do
      prepare_rotated_for_archiving
      move_prepared_to_archive
      rotate_current && notify_logger
    end
  end

  # Uploads every prepared archive; a file is deleted only after its upload
  # returned successfully.
  def move_prepared_to_archive
    prepared_for_archiving.each do |file|
      store_file_on_s3(file) && delete_file(file)
    end
  end

  # TODO: report transmission errors:
  # W, [2010-04-23T09:10:49.396732 #30275] WARN -- : ##### RightAws::S3Interface returned an error: 400 Bad Request
  # <?xml version="1.0" encoding="UTF-8"?>
  # <Error><Code>BadDigest</Code><Message>The Content-MD5 you specified did not match what we received.</Message><ExpectedDigest>5f7d35353204db919b646cd3eeeedca2</ExpectedDigest><CalculatedDigest>X301NTIE25GbZGzT7u7cog==</CalculatedDigest><RequestId>CA99C7EA27C5563D</RequestId><HostId>n125lcdYZDbNTV1Wy8OSfL9W5itYbB7wJtLHCo0Uzq/gKfXbu6hzUzwOP9dgkKXy</HostId></Error> #####
  # W, [2010-04-23T09:10:49.396788 #30275] WARN -- : ##### RightAws::S3Interface request: rwdata-logs.s3.amazonaws.com:443/observer-log-2010-04-17-23-20-01-4ef6cbdef683b156 ####
  # /usr/lib/ruby/gems/1.8/gems/right_aws-1.10.0/lib/awsbase/right_awsbase.rb:359:in `request_info_impl': BadDigest: The Content-MD5 you specified did not match what we received. (RightAws::AwsError)

  # Uploads +file+ under its base name, sending a Content-MD5 header computed
  # from the file. Returns true; upload errors propagate as exceptions.
  def store_file_on_s3(file)
    # The archives are gzip data, so read them in binary mode.
    File.open(file, 'rb') do |f|
      s3.put(bucket, File.basename(file), f, "Content-MD5" => md5(file), 'Content-Type' => 'text/plain')
    end
    true
  end

  def delete_file(file)
    File.unlink(file)
  end

  # Renames each rotated file to "<name>-<16 hex chars of SHA1>" and gzips it.
  def prepare_rotated_for_archiving
    Dir[rotated_glob].each do |file|
      signed_name = "#{file}-#{short_sha(file)}"
      File.rename file, signed_name
      system "gzip", signed_name
    end
  end

  # Rotates the live log when it exists and is non-empty.
  # Returns true when a rotation happened, false otherwise.
  def rotate_current
    if File.size?(file_path)
      rename_current
      create_new
      true
    else
      false
    end
  end

  # Sends USR1 to the pid stored in the pid file. Does nothing when the pid
  # file is missing or does not contain a positive pid.
  def notify_logger
    pid = logger_pid
    Process.kill('USR1', pid) if pid
  end

  ##

  # Memoized S3 interface built from the credentials in the key file.
  def s3
    @s3 ||= RightAws::S3.new(*aws_keys).interface
  end

  # Returns [access_key_id, secret_access_key] read from the YAML key file,
  # which is expected to use the symbol keys :access_key_id and
  # :secret_access_key.
  def aws_keys(file = key_file)
    hash = YAML.load_file(file)
    [hash[:access_key_id], hash[:secret_access_key]]
  end

  # Base64 encoded MD5 digest of the file at +path+.
  def md5(path)
    [Digest::MD5.file(path).digest].pack('m').chomp
  end

  def key_file
    @key_file ||= DEFAULT_KEY_FILE
  end

  def bucket
    @bucket ||= DEFAULT_BUCKET
  end

  # First 16 hex characters of the SHA1 digest of +file+.
  def short_sha(file)
    Digest::SHA1.file(file).hexdigest[0,16]
  end

  def rename_current
    File.rename file_path, rotated_path if File.exist?(file_path)
  end

  # Creates an empty replacement log file (mode 0644) if none exists yet.
  def create_new
    File.new(file_path, "a", 0644).close
  end

  # Path the live log is renamed to: dots in the file name become dashes and
  # the current time is appended.
  def rotated_path
    File.join(@dir, @file_name.gsub('.','-') + Time.now.strftime(TIME_FORMAT))
  end

  # Glob matching every file produced by #rotated_path: the timestamp is
  # formatted once and each digit is replaced by a [0-9] character class.
  def rotated_glob
    file_part = @file_name.gsub('.','-')
    time_part = Time.now.strftime(TIME_FORMAT).gsub(/\d/,'[0-9]')
    File.join(@dir, file_part + time_part)
  end

  # Signed and gzipped files waiting to be uploaded.
  def prepared_for_archiving
    glob = rotated_glob + '-' + ('[0-9a-f]' * 16) + ".gz"
    Dir[glob]
  end

  # Manual test helper: writes a log line and a pid file, then waits for
  # USR1 signals for roughly 100 seconds.
  def self.act_as_fake_nginx
    Dir.mkdir 'testlogs' unless File.exist?('testlogs')
    File.open('testlogs/test.log', 'a') { |f| f.puts "loggy" }
    File.open('test.pid', 'w') { |f| f.print $$ }
    Signal.trap("USR1") { puts "USR1 @ #{Time.now}" }
    puts "Listening for USR1s"
    10.times { sleep 10}
    puts "Done listening for USR1s"
  end

  private

  # Pid read from the pid file, or nil when the file is missing or its content
  # is not a positive integer. String#to_i yields 0 for empty or garbage
  # content, and signalling pid 0 (or a negative pid) would hit a whole
  # process group instead of the logger.
  def logger_pid
    return nil unless File.exist?(@pid_file)
    pid = File.read(@pid_file).to_i
    pid > 0 ? pid : nil
  end

end
@@ -0,0 +1,20 @@
1
+ $: << File.dirname(__FILE__)
2
+ require "stream_wrapper/gzip_multi_file"
3
+ require "stream_wrapper/observation_filter"
4
+ require "stream_wrapper/order_blob_splitter"
5
+
6
module StreamWrapper

  # Builds the reader chain for the given log paths.
  #
  # Without filters the result is an OrderBlobSplitter reading from a
  # GzipMultiFile. With filters the GzipMultiFile receives the filters too and
  # the splitter is additionally wrapped in an ObservationFilter.
  def self.open(paths = [], filters = nil)
    return OrderBlobSplitter.new(GzipMultiFile.new(paths)) unless filters

    splitter = OrderBlobSplitter.new(GzipMultiFile.new(paths, filters))
    ObservationFilter.new(splitter, filters)
  end

end
@@ -0,0 +1,90 @@
1
require 'shellwords'

module StreamWrapper
  # Presents a list of plain or gzipped files as one continuous line stream.
  #
  # Each file is read through a shell pipeline (cat or gunzip -c, optionally
  # followed by grep -F pre-filters). Files that yield no output are skipped.
  # When no paths are given, lines are read from $stdin instead.
  class GzipMultiFile

    # Use whitelist to be sure. Observation filters is applied after anyways.
    # Anchored with \A and \z: ^ and $ match per line in Ruby, so a value such
    # as "ok\nrm -rf x" would otherwise pass and reach the shell.
    SHOP_AND_ACCOUNT_ID_FILTER = /\A[a-zA-Z0-9._]+\z/

    attr_writer :debug

    # paths   - a single path or an Array of paths (the Array is copied).
    # filters - Hash that may contain :shop_id, :account_id and
    #           :observation_type; nil is treated as no filters.
    def initialize(paths = [], filters = {})
      @paths = paths.is_a?(Array) ? paths.dup : [paths]
      @filters = filters || {}
      if @paths.empty?
        @current_line = 0
        @current_file = $stdin
        @current_path = '<stdin>'
      end
    end

    # Returns the next line, moving on to the next file when the current one
    # is exhausted, or nil when there is nothing left to read.
    def gets
      file = current_stream
      return nil unless file
      @current_line += 1
      file.gets
    end

    # True when every file has been read to its end.
    def eof?
      ! current_stream
    end

    # Closes the stream that is currently being read, if any.
    def close
      @current_file.close unless @current_file.nil?
    end

    # "<path>:<line number>" of the line most recently returned by #gets.
    def current_position
      "#{@current_path}:#{@current_line}"
    end

    def debug?
      @debug
    end

    protected

    # Writes +message+ to stderr when debugging is enabled.
    def log(message)
      $stderr.puts message if debug?
    end

    # The stream to read from, advancing to the next non-empty file when the
    # current one is missing or at its end. Returns nil when none is left.
    def current_stream
      if @current_file.nil? || @current_file.eof?
        close
        @current_file = next_available_file
      end
      @current_file
    end

    # Opens pipelines for the remaining paths, in order, until one produces
    # output. Returns that stream, or nil when the paths are used up.
    def next_available_file
      loop do
        if @current_path
          log "Closed #{@current_path} after reading #{@current_line} lines"
        end
        @current_line = 0
        @current_path = @paths.shift
        return nil unless @current_path
        log "Switching to #{@current_path}"
        file = IO.popen(shell_command(@current_path))
        return file unless file.eof?
        # Close pipelines that produced nothing, so skipped files do not
        # leave open pipes and unreaped child processes behind.
        file.close
      end
    end

    # Builds the shell pipeline that reads +path+. Every interpolated value is
    # shell-escaped; shop and account ids must additionally pass the whitelist
    # or their pre-filter is left out.
    def shell_command(path)
      reader = path =~ /\.gz\z/ ? "gunzip -c" : "cat"
      command = "#{reader} #{Shellwords.escape(path)}"

      shop_id = @filters[:shop_id]
      if shop_id && shop_id.to_s =~ SHOP_AND_ACCOUNT_ID_FILTER
        command += " | grep -F #{Shellwords.escape("sid=#{shop_id}")}"
      end

      account_id = @filters[:account_id]
      if account_id && account_id.to_s =~ SHOP_AND_ACCOUNT_ID_FILTER
        command += " | grep -F #{Shellwords.escape("aid=#{account_id}")}"
      end

      observation_type = @filters[:observation_type]
      if observation_type
        command += " | grep -F #{Shellwords.escape("o=#{observation_type}")}"
      end

      command
    end

  end
end
@@ -0,0 +1,113 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'uri'
3
+ require 'observation'
4
+ require 'logbox'
5
+
6
require 'stringio'

module StreamWrapper
  # Filters the rows of a log stream and enumerates them as Observations.
  #
  # Filter options are given as a Hash. A key is either a plain name
  # (:account_id) meaning "keep only matching rows" or the same name prefixed
  # with skip_ (:skip_account_id) meaning "drop matching rows".
  #   :account_id / :shop_id / :observation_type => value
  #       matched case-insensitively against aid= / sid= / o= in the row.
  #   :debug_observations => true
  #       matches rows containing d=debug; the skip_ form also drops rows
  #       from a few known ip addresses.
  #   :valid / :skip_valid => true
  #       applied by #each to the parsed Observation.
  class ObservationFilter
    include Enumerable
    attr_reader :observation_count

    # Bytes left unescaped in filter values: the same set JavaScript's
    # encodeURIComponent leaves alone (as far as we can know).
    # See https://developer.mozilla.org/en/Core_JavaScript_1.5_Reference/Global_Functions/encodeURIComponent
    UNESCAPED_BYTE = /\A[-_.!~*'()a-zA-Z\d]\z/

    # Query parameter each value filter is matched against.
    ROW_PARAMETERS = { :account_id => "aid", :shop_id => "sid", :observation_type => "o" }

    # Support both strings and streams as input.
    def initialize(input, filter_options)
      input = StringIO.new(input) if input.is_a?(String)
      @stream = input
      # Quick check of options. It would be easy to set :valid => false but it is not allowed.
      if filter_options.detect { |key, value| value == false }
        raise "A filter option can not have false as value. Use skip instead."
      end
      @filter_options = filter_options
      build_filters @filter_options
    end

    # Get next row that passes the filters, with invalid utf-8 byte sequences
    # removed. Returns nil at the end of the stream.
    def gets
      while row = @stream.gets
        row = Logbox::StringEncoder.iconv(row)
        break if keep? row
      end
      row
    end

    def eof?
      @stream.eof?
    end

    # Enumerate over Observations. Rows that cannot be parsed are reported on
    # stderr and skipped. observation_count holds the number of observations
    # yielded so far. Returns an Enumerator when called without a block.
    def each
      return enum_for(:each) unless block_given?
      @observation_count = 0
      only_valid = @filter_options[:valid]
      only_invalid = @filter_options[:skip_valid]
      while(row = gets)
        begin
          attrs = LogParser.parse_line(row) || next
        rescue LogParser::ParseError => e
          $stderr.puts("#{e} for row:#{row}")
          next
        end
        observation = Observation.new(attrs)
        if (only_valid && observation.valid?) ||
           (only_invalid && !observation.valid?) ||
           (only_valid.nil? && only_invalid.nil?)
          yield observation
          @observation_count += 1
        end
      end
    end

    protected

    # Translates the option Hash into @filters, a list of [skip, regexp] pairs.
    def build_filters options
      @filters = []
      options.keys.each do |full_key|
        # Filter keys can be of format :account_id or :skip_account_id.
        key_fragments = /^(skip_)?(.*)$/.match(full_key.to_s)
        key = key_fragments[2].to_sym
        skip = (key_fragments[1] == "skip_")
        value = options[full_key]

        if value != true && ROW_PARAMETERS.key?(key)
          # Expressions are url escaped in log file.
          expression = escape_filter_value(value)
          pattern = Regexp.escape("#{ROW_PARAMETERS[key]}=#{expression}")
          @filters << [skip, Regexp.new(pattern, Regexp::IGNORECASE)]
        end

        if key == :debug_observations && value == true
          if skip
            # When SKIPPING tests, we can have SEVERAL regular expressions.
            # Watch out for debug flag, but also known ips.
            @filters << [skip, /^81\.231\.246\.117/] # ICU Intelligence
            @filters << [skip, /^84\.218\.38\.122/] # Nils Svangård
            @filters << [skip, /d=debug/]
          else
            @filters << [skip, /d=debug/]
          end
        end
      end
    end

    # Percent-encodes every byte of +value+ that is not in UNESCAPED_BYTE.
    # Replaces URI.escape, which was removed from Ruby in 3.0.
    def escape_filter_value value
      value.to_s.unpack("C*").map do |byte|
        char = byte.chr
        char =~ UNESCAPED_BYTE ? char : format("%%%02X", byte)
      end.join
    end

    def keep? row
      # All filters must match to accept row.
      @filters.each do |filter|
        skip, regexp = filter
        if skip
          return false if row =~ regexp
        else
          return false unless row =~ regexp
        end
      end
      true
    end
  end
end
113
+
@@ -0,0 +1,96 @@
1
+ # -*- encoding: utf-8 -*-
2
require 'cgi'
require 'stringio'

module StreamWrapper

  # Modifies and injects buy_basket blob information into stream.
  # 1. Inserts attributes from buy_basket blob into the buy_basket observation.
  # 2. Adds a buy_item observation for each item in the basket.
  class OrderBlobSplitter

    # Field names of the RW:T (order) row of the blob, in blob order.
    ORDER_FIELDS = [:order_id, :affiliation, :total, :tax, :shipping, :city, :state, :country]
    # Field names of an RW:I (item) row of the blob, in blob order.
    ITEM_FIELDS = [:order_id, :sku, :title, :category, :current_price, :quantity]

    # input - a stream responding to gets and eof?, or a String.
    def initialize(input)
      input = StringIO.new(input) if input.is_a?(String) # Used for testing.
      @stream = input
      @buy_item_lines = []
    end

    # Returns the next line. A buy_basket line is returned with the order
    # attributes merged in; the buy_item lines generated from it are returned
    # by the following calls, before the next line of the stream is read.
    def gets
      if @buy_item_lines.size > 0
        return @buy_item_lines.shift
      end

      line = @stream.gets
      if line && line.include?("&o=buy_basket&")
        extract_buy_items(line)
        line = reformat_buy_basket(line)
      end
      line
    end

    # True only when the stream is at its end AND no generated buy_item lines
    # are waiting; otherwise a reader looping on eof? would lose the items of
    # the last basket.
    def eof?
      @buy_item_lines.empty? && @stream.eof?
    end

    private

    # Blob format:
    # RW:T|2142||1079.00|215.80|0.00|strömstad||Sweden|
    # RW:I|2142|MATAW09 Wanni L|Overall (98)||1079.00|1|
    #
    # See also http://www.google.com/support/googleanalytics/bin/answer.py?hl=en&answer=55528

    # Returns +line+ with the order attributes of its blob merged into the
    # user attributes. Best effort: the line is returned unchanged when the
    # blob or the attributes cannot be parsed.
    def reformat_buy_basket(line)
      blob = extract_order_blob(line)
      order_row = blob.match(/^RW:T\|(([^\|]*\|[^\|]*){7})\|/)
      return line unless order_row
      insert_user_attributes(line, named_values(ORDER_FIELDS, order_row[1]))
    rescue StandardError
      line
    end

    # Queues one buy_item line for every RW:I row found in the blob of +line+.
    def extract_buy_items(line)
      blob = extract_order_blob(line)
      blob.scan(/RW:I\|(([^\|]*\|[^\|]*){5})\|/) do |item_blob|
        attributes = named_values(ITEM_FIELDS, item_blob[0])
        @buy_item_lines << insert_user_attributes(line, attributes, :replace).gsub("&o=buy_basket", "&o=buy_item")
      end
    end

    # Maps the "|"-separated values of +row+ onto +names+ by position.
    def named_values(names, row)
      values = row.split("|")
      attributes = {}
      names.each_with_index do |name, index|
        attributes[name] = values[index]
      end
      attributes
    end

    # Returns the url-decoded _order_blob parameter of +line+, or "" when the
    # line has none or cannot be matched (e.g. invalid byte sequences).
    def extract_order_blob(line)
      found = line.match(/_order_blob=([^&]+)&/)
      found ? CGI.unescape(found[1]) : ""
    rescue StandardError
      ""
    end

    # 81.225.100.141 - - [23/Nov/2009:06:45:23 +0100] "GET /log.gif?_order_blob=RW%3AT%7C2142%7C%7C1079.00%7C215.80%7C0.00%7Cstr%C3%B6mstad%7C%7CSweden%7C%20%0ARW%3AI%7C2142%7CMATAW09%20Wanni%20L%7COverall%20(98)%7C%7C1079.00%7C1%7C&_order_nr=002142&a=Mozilla%2F4.0%20(compatible%3B%20MSIE%208.0%3B%20Windows%20NT%205.1%3B%20Trident%2F4.0%3B%20Mozilla%2F4.0%20(compatible%3B%20MSIE%206.0%3B%20Windows%20NT%205.1%3B%20SV1)%20%3B%20.NET%20CLR%202.0.50727%3B%20.NET%20CLR%203.0.4506.2152%3B%20.NET%20CLR%203.5.30729)&aid=jetshop&l=sv&n=microsoft%20internet%20explorer&o=buy_basket&p=win32&s=1024x600&sid=www.pixiekids.se&t=Unika%20barnkl%C3%A4der%20%26%20babykl%C3%A4der%20hos%20Pixiekids%20%E2%8E%AE%200-10%20%C3%A5r&u=https%3A%2F%2Fwww.pixiekids.se%2FOrderDetailsConfirmed.aspx&uid=1258954551578656003&x=87544&z=-60& HTTP/1.1" 200 35 "https://www.pixiekids.se/OrderDetailsConfirmed.aspx" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"

    # Rewrites the user attributes (the leading "_"-prefixed parameters) of
    # +line+. With method :merge the given attributes are added to the
    # existing ones, replacing same-named keys; with :replace only the given
    # attributes remain. Nil and empty values are left out. The attributes
    # are written back sorted.
    def insert_user_attributes(line, attributes, method = :merge)
      encoded_attributes = []
      attributes_regexp = /\?(.*?)&(?!_)/
      # Add existing attributes from line.
      unless method == :replace
        # Split on the first "=" only, so a value containing "=" stays intact.
        encoded_attributes = line.match(attributes_regexp)[1].split("&").map { |pair| pair.split("=", 2) }
      end
      # Add in attributes from method argument.
      attributes.each do |key, value|
        next if value.nil? || value == ""
        encoded_key = CGI.escape("_#{key}")
        encoded_value = CGI.escape(value)
        encoded_attributes.reject!{ |k, v| k == encoded_key }
        encoded_attributes << [encoded_key, encoded_value]
      end
      # Reconstruct the user attributes part of the line.
      attributes_string = encoded_attributes.sort.map do |key, value|
        "#{key}=#{value}&"
      end.join
      # Replace the old user attributes in the line with the new ones.
      line.sub(attributes_regexp, "?#{attributes_string}")
    end

  end
end
95
+
96
+