logbox 0.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. data/.bundle/config +3 -0
  2. data/.rvmrc +2 -0
  3. data/Gemfile +17 -0
  4. data/Gemfile.lock +30 -0
  5. data/README +14 -0
  6. data/Rakefile +74 -0
  7. data/VERSION +1 -0
  8. data/bin/download_logs +20 -0
  9. data/bin/obsstats +39 -0
  10. data/bin/rotate +17 -0
  11. data/bin/viewobs +198 -0
  12. data/lib/logbox.rb +9 -0
  13. data/lib/logbox/ansi_colors.rb +28 -0
  14. data/lib/logbox/log_parser.rb +79 -0
  15. data/lib/logbox/mockup_log.rb +44 -0
  16. data/lib/logbox/observation.rb +162 -0
  17. data/lib/logbox/observation_compiler.rb +311 -0
  18. data/lib/logbox/observation_mover.rb +142 -0
  19. data/lib/logbox/stream_wrapper.rb +20 -0
  20. data/lib/logbox/stream_wrapper/gzip_multi_file.rb +90 -0
  21. data/lib/logbox/stream_wrapper/observation_filter.rb +113 -0
  22. data/lib/logbox/stream_wrapper/order_blob_splitter.rb +96 -0
  23. data/lib/setup_environment.rb +15 -0
  24. data/logbox.gemspec +110 -0
  25. data/test/bin_viewobs_test.rb +42 -0
  26. data/test/fixtures/aws_keys_yaml.txt +3 -0
  27. data/test/fixtures/double-obs.log +1 -0
  28. data/test/fixtures/error_line.log +1 -0
  29. data/test/fixtures/log-for-md5.log +1 -0
  30. data/test/fixtures/log0.log +0 -0
  31. data/test/fixtures/log1.log +1 -0
  32. data/test/fixtures/log1.log.gz +0 -0
  33. data/test/fixtures/log2.log +2 -0
  34. data/test/fixtures/log2.log.gz +0 -0
  35. data/test/fixtures/log_invalid_mixed_encoding.log +1 -0
  36. data/test/fixtures/observation_filter.log +5 -0
  37. data/test/fixtures/unquoted_ugliness.log +2 -0
  38. data/test/log_parser_test.rb +84 -0
  39. data/test/observation_compiler_test.rb +216 -0
  40. data/test/observation_mover_test.rb +135 -0
  41. data/test/observation_test.rb +114 -0
  42. data/test/stream_wrapper/gzip_multi_file_test.rb +147 -0
  43. data/test/stream_wrapper/observation_filter_test.rb +171 -0
  44. data/test/stream_wrapper/order_blob_splitter_test.rb +129 -0
  45. data/test/test_helper.rb +23 -0
  46. metadata +177 -0
@@ -0,0 +1,142 @@
1
+ require 'digest/sha1'
2
+ require 'digest/md5'
3
+ require 'yaml'
4
+
5
+ require 'rubygems'
6
+ require 'single_instance'
7
+ require 'right_aws'
8
+
9
# Rotates a log file and ships previously rotated copies to S3.
#
# One #run performs, in this order:
#   1. every rotated file ("<name>-<timestamp>") gets a 16 character SHA1
#      suffix and is gzipped;
#   2. every signed and gzipped file is uploaded to the bucket and, once the
#      upload call returned, deleted locally;
#   3. the live log is renamed to "<name>-<timestamp>", an empty replacement
#      is created and the logging process is signalled with USR1.
# A file rotated in step 3 is therefore archived by the next run.
class ObservationMover

  DEFAULT_KEY_FILE = '/etc/s3_key.yml'
  DEFAULT_BUCKET = 'rwdata-logs'

  # Suffix appended to a rotated log; also the template for #rotated_glob.
  TIME_FORMAT = '-%Y-%m-%d-%H-%M-%S'

  attr_writer :key_file
  attr_writer :bucket

  # path::     the live log file to rotate.
  # pid_file:: file holding the pid of the process to signal after a rotation.
  def initialize(path, pid_file = '/var/run/nginx.pid')
    @dir, @file_name = File.split(path)
    @basename = File.basename(@file_name, File.extname(@file_name))
    @pid_file = pid_file
  end

  # Full path of the live log file.
  def file_path
    File.join(@dir, @file_name)
  end

  # Archives what earlier runs rotated, then rotates the live log. Everything
  # happens inside SingleInstance.exclusive_non_blocking(:move_observations),
  # presumably so that two movers never work on the same files concurrently.
  def run
    SingleInstance.exclusive_non_blocking(:move_observations) do
      prepare_rotated_for_archiving
      move_prepared_to_archive
      rotate_current && notify_logger
    end
  end

  # Uploads every prepared file and deletes it locally afterwards.
  def move_prepared_to_archive
    prepared_for_archiving.each do |file|
      store_file_on_s3(file) && delete_file(file)
    end
  end

  # TODO: report transmission errors. A corrupted upload is answered by S3
  # with "400 Bad Request" / <Code>BadDigest</Code> ("The Content-MD5 you
  # specified did not match what we received.") and right_aws raises
  # RightAws::AwsError from request_info_impl, which is not handled here.

  # Uploads +file+ under its base name. The Content-MD5 header lets S3 verify
  # the payload. Returns true; failures surface as exceptions from s3.put.
  def store_file_on_s3(file)
    File.open(file) do |f|
      s3.put(bucket, File.basename(file), f, "Content-MD5" => md5(file), 'Content-Type' => 'text/plain')
    end
    true
  end

  def delete_file(file)
    File.unlink(file)
  end

  # Renames every rotated file to "<file>-<short sha>" and gzips it.
  # NOTE(review): the exit status of gzip is not checked; a file that failed
  # to compress keeps its signed name and is matched by neither glob again.
  def prepare_rotated_for_archiving
    Dir[rotated_glob].each do |file|
      signed_name = "#{file}-#{short_sha(file)}"
      File.rename file, signed_name
      system "gzip", signed_name
    end
  end

  # Rotates the live log when it exists and is not empty.
  # Returns true when a rotation took place, false otherwise.
  def rotate_current
    return false unless File.size?(file_path)

    rename_current
    create_new
    true
  end

  # Sends USR1 to the process named in the pid file, if that file exists.
  # A pid file without a positive number is ignored: Process.kill with pid 0
  # would signal the whole process group of the caller, this process included.
  def notify_logger
    return unless File.exist?(@pid_file)

    pid = File.read(@pid_file).to_i
    Process.kill 'USR1', pid if pid > 0
  end

  ##

  # Memoized S3 interface built from the credentials in the key file.
  def s3
    @s3 ||= RightAws::S3.new(*aws_keys).interface
  end

  # Reads [access_key_id, secret_access_key] from a YAML file whose top level
  # keys are the symbols :access_key_id and :secret_access_key.
  def aws_keys(file = key_file)
    hash = YAML.load_file(file)
    [hash[:access_key_id], hash[:secret_access_key]]
  end

  # Base64 encoded MD5 digest of the file, the format Content-MD5 expects.
  def md5(path)
    [Digest::MD5.file(path).digest].pack('m').chomp
  end

  def key_file
    @key_file ||= DEFAULT_KEY_FILE
  end

  def bucket
    @bucket ||= DEFAULT_BUCKET
  end

  # First 16 hex characters of the SHA1 digest of the file.
  def short_sha(file)
    Digest::SHA1.file(file).hexdigest[0, 16]
  end

  def rename_current
    File.rename file_path, rotated_path if File.exist?(file_path)
  end

  # Creates the (empty) live log with mode 0644 unless it already exists.
  def create_new
    File.new(file_path, "a", 0644).close
  end

  # Target name for a rotation happening now: dots in the file name become
  # dashes and the current time is appended.
  def rotated_path
    File.join(@dir, @file_name.gsub('.', '-') + Time.now.strftime(TIME_FORMAT))
  end

  # Glob matching every name #rotated_path can produce: the formatted time is
  # used as a template in which each digit becomes the class [0-9].
  def rotated_glob
    file_part = @file_name.gsub('.', '-')
    time_part = Time.now.strftime(TIME_FORMAT).gsub(/\d/, '[0-9]')
    File.join(@dir, file_part + time_part)
  end

  # Files that were signed and gzipped by #prepare_rotated_for_archiving.
  def prepared_for_archiving
    glob = rotated_glob + '-' + ('[0-9a-f]' * 16) + ".gz"
    Dir[glob]
  end

  # Manual test helper: writes a log line and a pid file into the working
  # directory, then reports incoming USR1 signals for about 100 seconds.
  def self.act_as_fake_nginx
    Dir.mkdir 'testlogs' unless File.exist?('testlogs')
    File.open('testlogs/test.log', 'a') { |f| f.puts "loggy" }
    File.open('test.pid', 'w') { |f| f.print $$ }
    Signal.trap("USR1") { puts "USR1 @ #{Time.now}" }
    puts "Listening for USR1s"
    10.times { sleep 10 }
    puts "Done listening for USR1s"
  end

end
@@ -0,0 +1,20 @@
1
+ $: << File.dirname(__FILE__)
2
+ require "stream_wrapper/gzip_multi_file"
3
+ require "stream_wrapper/observation_filter"
4
+ require "stream_wrapper/order_blob_splitter"
5
+
6
module StreamWrapper

  # Builds the reader chain for the given log files.
  #
  # paths::   file path(s) handed to GzipMultiFile.
  # filters:: optional filter options. When given they are passed to both
  #           GzipMultiFile and an ObservationFilter that wraps the chain.
  #
  # Returns an OrderBlobSplitter without filters, an ObservationFilter with.
  def self.open(paths = [], filters = nil)
    return OrderBlobSplitter.new(GzipMultiFile.new(paths)) unless filters

    splitter = OrderBlobSplitter.new(GzipMultiFile.new(paths, filters))
    ObservationFilter.new(splitter, filters)
  end

end
@@ -0,0 +1,90 @@
1
require 'shellwords'

module StreamWrapper
  # Presents a list of plain or gzipped files as one continuous stream of
  # lines. Each file is read through a shell pipeline (cat or gunzip -c) that
  # may be extended with grep pre-filters. Without any path, $stdin is read.
  class GzipMultiFile

    # Use whitelist to be sure. Observation filters is applied after anyways.
    # Anchored with \A and \z: ^ and $ match per line, so a value with an
    # embedded newline would otherwise slip through.
    SHOP_AND_ACCOUNT_ID_FILTER = /\A[a-zA-Z0-9._]+\z/

    attr_writer :debug

    # paths::   a single path or an array of paths; the array is copied.
    # filters:: hash that may contain :shop_id, :account_id and
    #           :observation_type; nil is treated as no filters.
    def initialize(paths = [], filters = {})
      @paths = paths.is_a?(Array) ? paths.dup : [paths]
      @filters = filters || {}
      return unless @paths.empty?

      @current_line = 0
      @current_file = $stdin
      @current_path = '<stdin>'
    end

    # Next line from the current file, moving on to the next file when the
    # current one is exhausted. Returns nil when every file has been read.
    def gets
      file = current_stream
      return nil unless file

      @current_line += 1
      file.gets
    end

    # True when no file has anything left to read.
    def eof?
      !current_stream
    end

    def close
      @current_file.close unless @current_file.nil?
    end

    # "<path>:<line>" of the line returned by the latest #gets.
    def current_position
      "#{@current_path}:#{@current_line}"
    end

    def debug?
      @debug
    end

    protected

    def log(message)
      $stderr.puts message if debug?
    end

    # The stream to read from, switching to the next non-empty file when the
    # current one is missing or exhausted. Returns nil when nothing is left.
    def current_stream
      if @current_file.nil? || @current_file.eof?
        close
        @current_file = next_available_file
      end
      @current_file
    end

    # Opens the remaining paths one by one until one of them yields data.
    # Returns nil (and leaves @current_path nil) when the list is exhausted.
    def next_available_file
      loop do
        log "Closed #{@current_path} after reading #{@current_line} lines" if @current_path
        @current_line = 0
        @current_path = @paths.shift
        return nil unless @current_path

        log "Switching to #{@current_path}"
        file = IO.popen(shell_command(@current_path))
        return file unless file.eof?

        # Nothing to read: close the pipe so the child process is reaped
        # before the next path is tried.
        file.close
      end
    end

    # Builds the shell pipeline that reads +path+. Everything that does not
    # pass the whitelist is shell escaped, because paths and the observation
    # type may come from the command line.
    def shell_command(path)
      reader = path =~ /\.gz\z/ ? 'gunzip -c' : 'cat'
      command = "#{reader} #{Shellwords.escape(path)}"

      # Ids failing the whitelist are not pre-filtered here at all.
      shop_id = @filters[:shop_id].to_s
      command += " | grep -F sid=#{shop_id}" if shop_id =~ SHOP_AND_ACCOUNT_ID_FILTER

      account_id = @filters[:account_id].to_s
      command += " | grep -F aid=#{account_id}" if account_id =~ SHOP_AND_ACCOUNT_ID_FILTER

      if @filters[:observation_type]
        command += " | grep -F #{Shellwords.escape("o=#{@filters[:observation_type]}")}"
      end
      command
    end

  end
end
@@ -0,0 +1,113 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'uri'
3
+ require 'observation'
4
+ require 'logbox'
5
+
6
+ module StreamWrapper
7
+ class ObservationFilter
8
+ include Enumerable
9
+ attr_reader :observation_count
10
+
11
+ # Support both strings and streams as input.
12
+ def initialize(input, filter_options)
13
+ input = StringIO.new(input) if input.class == String
14
+ @stream = input
15
+ # Quick check of options. It would be easy to set :valid => false but it is not allowed.
16
+ if filter_options.detect { |key, value| value == false }
17
+ raise "A filter option can not have false as value. Use skip instead."
18
+ end
19
+ @filter_options = filter_options
20
+ build_filters @filter_options
21
+ end
22
+
23
+ # Get next row and remove invalid utf-8 byte sequences.
24
+ def gets
25
+ while row = @stream.gets
26
+ row = Logbox::StringEncoder.iconv(row)
27
+ break if keep? row
28
+ end
29
+ row
30
+ end
31
+
32
+ def eof?
33
+ @stream.eof?
34
+ end
35
+
36
+ # Enumerate over Observations
37
+ def each
38
+ @observation_count = 0
39
+ while(row = gets)
40
+ begin
41
+ attrs = LogParser.parse_line(row) || next
42
+ rescue LogParser::ParseError => e
43
+ $stderr.puts("#{e} for row:#{row}")
44
+ next
45
+ end
46
+ observation = Observation.new(attrs)
47
+ if (@filter_options[:valid] && observation.valid?) ||
48
+ (@filter_options[:skip_valid] && !observation.valid?) ||
49
+ (@filter_options[:valid].nil? && @filter_options[:skip_valid].nil?)
50
+ yield observation
51
+ @observation_count += 1
52
+ end
53
+ end
54
+ end
55
+
56
+ protected
57
+
58
+ def build_filters options
59
+ @filters = []
60
+ options.keys.each do |full_key|
61
+ # Filter keys can be of format :account_id or :skip_account_id.
62
+ key_fragments = /^(skip_)?(.*)$/.match(full_key.to_s)
63
+ key = key_fragments[2].to_sym
64
+ skip = (key_fragments[1] == "skip_")
65
+
66
+ if options[full_key] != true
67
+ # Expressions are url escaped in log file. Encode same characters as
68
+ # JavaScript (as far as we can know).
69
+ # See https://developer.mozilla.org/en/Core_JavaScript_1.5_Reference/Global_Functions/encodeURIComponent
70
+ expression = URI.escape(options[full_key].to_s, /[^-_.!~*'()a-zA-Z\d]/n)
71
+
72
+ if key == :account_id
73
+ @filters << [skip, Regexp.new(Regexp.escape("aid=#{expression}"), true)]
74
+ end
75
+
76
+ if key == :shop_id
77
+ @filters << [skip, Regexp.new(Regexp.escape("sid=#{expression}"), true)]
78
+ end
79
+
80
+ if key == :observation_type
81
+ @filters << [skip, Regexp.new(Regexp.escape("o=#{expression}"), true)]
82
+ end
83
+ end
84
+
85
+ if key == :debug_observations && options[full_key] == true
86
+ if skip
87
+ # When SKIPPING tests, we can have SEVERAL regular expressions.
88
+ # Watch out for debug flag, but also known ips.
89
+ @filters << [skip, /^81\.231\.246\.117/] # ICU Intellegence
90
+ @filters << [skip, /^84\.218\.38\.122/] # Nils Svangård
91
+ @filters << [skip, /d=debug/]
92
+ else
93
+ @filters << [skip, /d=debug/]
94
+ end
95
+ end
96
+ end
97
+ end
98
+
99
+ def keep? row
100
+ # All filters must match to accept row.
101
+ @filters.each do |filter|
102
+ skip, regexp = filter
103
+ if skip
104
+ return false if row =~ regexp
105
+ else
106
+ return false unless row =~ regexp
107
+ end
108
+ end
109
+ true
110
+ end
111
+ end
112
+ end
113
+
@@ -0,0 +1,96 @@
1
+ # -*- encoding: utf-8 -*-
2
require 'cgi'
require 'stringio'

module StreamWrapper

  # Modifies and injects buy_basket blob information into stream.
  # 1. Inserts attributes from buy_basket blob into the buy_basket observation.
  # 2. Adds a buy_item observation for each item in the basket.
  #
  # Blob format:
  #   RW:T|2142||1079.00|215.80|0.00|strömstad||Sweden|
  #   RW:I|2142|MATAW09 Wanni L|Overall (98)||1079.00|1|
  #
  # See also http://www.google.com/support/googleanalytics/bin/answer.py?hl=en&answer=55528
  class OrderBlobSplitter

    # Field names of the RW:T (order) record, in blob order.
    ORDER_ATTRIBUTES = [:order_id, :affiliation, :total, :tax, :shipping, :city, :state, :country]

    # Field names of an RW:I (item) record, in blob order.
    ITEM_ATTRIBUTES = [:order_id, :sku, :title, :category, :current_price, :quantity]

    # The user attributes of a request line: everything between the "?" and
    # the first "&" that is not followed by "_".
    USER_ATTRIBUTES_REGEXP = /\?(.*?)&(?!_)/

    # Accepts a stream or, for testing, a string.
    def initialize(input)
      input = StringIO.new(input) if input.is_a?(String)
      @stream = input
      @buy_item_lines = []
    end

    # Next line of the stream. A buy_basket line is returned with the order
    # attributes merged in, and is followed by one generated buy_item line
    # per basket item before the underlying stream is read again.
    def gets
      return @buy_item_lines.shift unless @buy_item_lines.empty?

      line = @stream.gets
      return line unless line && line.include?("&o=buy_basket&")

      extract_buy_items(line)
      reformat_buy_basket(line)
    end

    # True only when the underlying stream is exhausted AND no generated
    # buy_item line is waiting; otherwise a reader that checks eof? before
    # gets would lose the items of a basket on the last line.
    def eof?
      @buy_item_lines.empty? && @stream.eof?
    end

    private

    # Returns the line with the RW:T fields added as "_<name>" attributes.
    # Best effort: a line whose blob cannot be interpreted is returned as is.
    def reformat_buy_basket(line)
      blob = extract_order_blob(line)
      order_blob = blob.match(/^RW:T\|(([^\|]*\|[^\|]*){7})\|/)[1]
      insert_user_attributes(line, named_values(ORDER_ATTRIBUTES, order_blob))
    rescue StandardError
      line
    end

    # Queues one buy_item line for every RW:I record in the blob. The item
    # attributes replace the user attributes of the buy_basket line.
    def extract_buy_items(line)
      blob = extract_order_blob(line)
      blob.scan(/RW:I\|(([^\|]*\|[^\|]*){5})\|/) do |item_blob|
        attributes = named_values(ITEM_ATTRIBUTES, item_blob[0])
        item_line = insert_user_attributes(line, attributes, :replace)
        @buy_item_lines << item_line.gsub("&o=buy_basket", "&o=buy_item")
      end
    end

    # Pairs the "|" separated fields of a record with their names, in order.
    # Names without a field get nil.
    def named_values(names, record)
      values = record.split("|")
      attributes = {}
      names.each_with_index { |name, index| attributes[name] = values[index] }
      attributes
    end

    # The unescaped value of the _order_blob parameter, or "" when the line
    # has none or it cannot be decoded.
    def extract_order_blob(line)
      encoded = line[/_order_blob=([^&]+)&/, 1]
      encoded ? CGI.unescape(encoded) : ""
    rescue StandardError
      ""
    end

    # Rewrites the user attributes part of a request line such as
    #   "GET /log.gif?_order_blob=RW%3AT...&_order_nr=002142&aid=jetshop&o=buy_basket&... HTTP/1.1"
    #
    # attributes:: hash of name => value; nil and "" values are left out and
    #              each name is stored as "_<name>".
    # method::     :merge keeps the existing user attributes (a new value
    #              wins over an existing one), :replace discards them.
    #
    # The resulting attributes are sorted. A line without a user attributes
    # part raises NoMethodError for :merge and is returned unchanged for
    # :replace.
    def insert_user_attributes(line, attributes, method = :merge)
      encoded_attributes = []
      # Add existing attributes from line.
      unless method == :replace
        existing = line.match(USER_ATTRIBUTES_REGEXP)[1]
        # Split on the first "=" only, so a value containing "=" stays whole.
        encoded_attributes = existing.split("&").map { |pair| pair.split("=", 2) }
      end
      # Add in attributes from method argument.
      attributes.each do |key, value|
        next if value.nil? || value == ""

        encoded_key = CGI.escape("_#{key}")
        encoded_attributes.reject! { |existing_key, _| existing_key == encoded_key }
        encoded_attributes << [encoded_key, CGI.escape(value)]
      end
      # Reconstruct the user attributes part of the line.
      attributes_string = encoded_attributes.sort.map { |key, value| "#{key}=#{value}&" }.join
      # Replace the old user attributes in the line with the new ones. The
      # block form inserts the text literally; with a replacement string,
      # backslash sequences coming from the log would act as back-references.
      line.sub(USER_ATTRIBUTES_REGEXP) { "?#{attributes_string}" }
    end

  end
end
95
+
96
+