logbox 0.2.10

Files changed (46)
  1. data/.bundle/config +3 -0
  2. data/.rvmrc +2 -0
  3. data/Gemfile +17 -0
  4. data/Gemfile.lock +30 -0
  5. data/README +14 -0
  6. data/Rakefile +74 -0
  7. data/VERSION +1 -0
  8. data/bin/download_logs +20 -0
  9. data/bin/obsstats +39 -0
  10. data/bin/rotate +17 -0
  11. data/bin/viewobs +198 -0
  12. data/lib/logbox.rb +9 -0
  13. data/lib/logbox/ansi_colors.rb +28 -0
  14. data/lib/logbox/log_parser.rb +79 -0
  15. data/lib/logbox/mockup_log.rb +44 -0
  16. data/lib/logbox/observation.rb +162 -0
  17. data/lib/logbox/observation_compiler.rb +311 -0
  18. data/lib/logbox/observation_mover.rb +142 -0
  19. data/lib/logbox/stream_wrapper.rb +20 -0
  20. data/lib/logbox/stream_wrapper/gzip_multi_file.rb +90 -0
  21. data/lib/logbox/stream_wrapper/observation_filter.rb +113 -0
  22. data/lib/logbox/stream_wrapper/order_blob_splitter.rb +96 -0
  23. data/lib/setup_environment.rb +15 -0
  24. data/logbox.gemspec +110 -0
  25. data/test/bin_viewobs_test.rb +42 -0
  26. data/test/fixtures/aws_keys_yaml.txt +3 -0
  27. data/test/fixtures/double-obs.log +1 -0
  28. data/test/fixtures/error_line.log +1 -0
  29. data/test/fixtures/log-for-md5.log +1 -0
  30. data/test/fixtures/log0.log +0 -0
  31. data/test/fixtures/log1.log +1 -0
  32. data/test/fixtures/log1.log.gz +0 -0
  33. data/test/fixtures/log2.log +2 -0
  34. data/test/fixtures/log2.log.gz +0 -0
  35. data/test/fixtures/log_invalid_mixed_encoding.log +1 -0
  36. data/test/fixtures/observation_filter.log +5 -0
  37. data/test/fixtures/unquoted_ugliness.log +2 -0
  38. data/test/log_parser_test.rb +84 -0
  39. data/test/observation_compiler_test.rb +216 -0
  40. data/test/observation_mover_test.rb +135 -0
  41. data/test/observation_test.rb +114 -0
  42. data/test/stream_wrapper/gzip_multi_file_test.rb +147 -0
  43. data/test/stream_wrapper/observation_filter_test.rb +171 -0
  44. data/test/stream_wrapper/order_blob_splitter_test.rb +129 -0
  45. data/test/test_helper.rb +23 -0
  46. metadata +177 -0
data/lib/logbox/log_parser.rb
@@ -0,0 +1,79 @@
+ require 'date'
+ require 'cgi'
+ require 'stringio'
+
+ # Parses a standard web server log file stream and returns a hash with
+ # key/values for each line. Includes the Enumerable interface.
+ class LogParser
+   include Enumerable
+
+   # Support both strings and streams as input.
+   def initialize(input)
+     input = StringIO.new(input) if input.class == String
+     @stream = input
+   end
+
+   # Enumerable interface.
+   def each
+     while(observation = get_next_observation)
+       yield observation
+     end
+   end
+
+   def get_next_observation
+     line = @stream.gets
+     line && LogParser.parse_line(line)
+   end
+
+   LOG_FORMAT = /([^ ]*) [^ ]* [^ ]* \[([^\]]*)\] "([^"]*)" ([^ ]*)/
+   LOG_DATE_FORMAT = "%d/%b/%Y:%H:%M:%S %z"
+   LOG_KEY_VALUE_FORMAT = /[?&]([^=]+)=([^&]+)/
+   SERVER_ATTRIBUTES = [:ip, :timestamp, :request, :status]
+
+   # Parse one log line and return a hash with all attributes.
+   def self.parse_line(line)
+     return nil if line.strip.empty?
+
+     line =~ LOG_FORMAT
+     result = {}
+
+     # Save ip, timestamp and request.
+     result[:ip] = $1
+     begin
+       result[:timestamp] = DateTime.strptime($2, LOG_DATE_FORMAT)
+     rescue ArgumentError
+       raise ParseError.new("Error while parsing timestamp")
+     end
+     result[:request] = $3
+     result[:status] = $4
+
+     # Extract key/value pairs from the query part of the request.
+     $3.scan(LOG_KEY_VALUE_FORMAT) do |key, value|
+       begin
+         key = CGI.unescape(key).to_sym
+         value = CGI.unescape(value)
+       rescue Encoding::CompatibilityError => e
+         raise ParseError.new("Error while parsing query parameters")
+       end
+
+       if result.has_key? key
+         if result[key].is_a? Array
+           result[key] << value
+         else
+           result[key] = [result[key], value]
+         end
+       else
+         result[key] = value
+       end
+     end
+
+     return result
+   rescue ParseError
+     raise
+   rescue
+     raise ParseError.new("Unknown parsing error")
+   end
+   class ParseError < StandardError ; end
+ end
+
+
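A quick usage sketch for the parser above (not part of the diff; the require path and the sample line are illustrative assumptions):

    require 'logbox/log_parser'   # assumes logbox's lib directory is on the load path

    # A single line in the shape LOG_FORMAT expects: ip, timestamp, request, status.
    line = '72.211.248.18 - - [02/Mar/2009:06:58:12 +0100] ' \
           '"GET /log.gif?o=visit_page&sid=DHS_Shop&uid=12345& HTTP/1.1" 200'
    hit = LogParser.parse_line(line)
    hit[:ip]      # => "72.211.248.18"
    hit[:status]  # => "200"
    hit[:o]       # => "visit_page"   (query parameters become symbol keys)

    # Streams (or whole strings) work too; each yields one hash per line.
    File.open("access.log") do |f|
      LogParser.new(f).each { |hit| p hit[:timestamp] }
    end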
data/lib/logbox/mockup_log.rb
@@ -0,0 +1,44 @@
+ require 'time'
+ require 'uri'
+
+ module MockupLog
+
+   # Create a custom log file string.
+   def mockup_log lines_data
+     template_lines = {
+       :visit_page => %Q'72.211.248.18 - - [02/Mar/2009:06:58:12 +0100] "GET /log.gif?a=Mozilla%2F5.0%20(Macintosh%3B%20U%3B%20Intel%20Mac%20OS%20X%2010_5_5%3B%20en-us)%20AppleWebKit%2F525.27.1%20(KHTML%2C%20like%20Gecko)%20Version%2F3.2.1%20Safari%2F525.27.1&aid=xroads&l=en-us&n=netscape&o=visit_page&p=macintel&r=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProductGroup.asp%3Fclick%3D1%26pageIndex%3D14%26type%3Dgeneric&s=1920x1200&sid=DHS_Shop&t=Product&u=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProduct.asp%3FitemId%3D6005&uid=1235973461091911420&x=35412&z=480& HTTP/1.1" 200 35 "http://designhousestockholm.xroads.se/Portal/Products/Product.asp?itemId=6005" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1" "-" "-"\n',
+       :view_item => %Q'72.211.248.18 - - [02/Mar/2009:06:59:32 +0100] "GET /log.gif?_current_price=US%2471&_image=..%2F..%2FArchive%2FImages%2FWebshop%2FProducts%2FDesignhouseStockholm%2F1132-1000_stor.jpg&_item_id=6005&_sku=HHAW09%20Shelter%20Pants&_thumbnail=..%2F..%2FArchive%2FImages%2FWebshop%2FProducts%2FDesignhouseStockholm%2F1132-1000_liten.jpg&_title=%C2%A0PLEECE%20-%20HOOD&_url=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProduct.asp%3FitemId%3D6005&a=Mozilla%2F5.0%20(Macintosh%3B%20U%3B%20Intel%20Mac%20OS%20X%2010_5_5%3B%20en-us)%20AppleWebKit%2F525.27.1%20(KHTML%2C%20like%20Gecko)%20Version%2F3.2.1%20Safari%2F525.27.1&aid=xroads&l=en-us&n=netscape&o=view_item&p=macintel&r=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProductGroup.asp%3Fclick%3D1%26pageIndex%3D14%26type%3Dgeneric&s=1920x1200&sid=DHS_Shop&t=Product&u=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProduct.asp%3FitemId%3D6005&uid=1235973461091911420&x=35412&z=480& HTTP/1.1" 200 35 "http://designhousestockholm.xroads.se/Portal/Products/Product.asp?itemId=6005" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1" "-" "-"\n',
+       :pick_item => %Q'72.211.248.18 - - [02/Mar/2009:07:00:28 +0100] "GET /log.gif?_item_id=6004&a=Mozilla%2F5.0%20(Macintosh%3B%20U%3B%20Intel%20Mac%20OS%20X%2010_5_5%3B%20en-us)%20AppleWebKit%2F525.27.1%20(KHTML%2C%20like%20Gecko)%20Version%2F3.2.1%20Safari%2F525.27.1&aid=xroads&e=click&l=en-us&n=netscape&o=pick_item&p=macintel&r=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProductGroup.asp%3Fclick%3D1%26pageIndex%3D14%26type%3Dgeneric&s=1920x1200&sid=DHS_Shop&t=Product&u=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProduct.asp%3FitemId%3D6004&uid=1235973461091911420&x=98033&z=480& HTTP/1.1" 200 35 "http://designhousestockholm.xroads.se/Portal/Products/Product.asp?itemId=6004" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1" "-" "-"\n',
+       :buy_item => %Q'72.211.248.18 - - [02/Mar/2009:07:01:28 +0100] "GET /log.gif?_current_price=1250&_order_id=13945&_quantity=1&_sku=FERTILITETSMONITOR&_title=Fertilitetsmonitor+fr%C3%A5n+Clearblue&a=Mozilla%2F4.0%20(compatible%3B%20MSIE%207.0%3B%20Windows%20NT%205.1%3B%20.NET%20CLR%201.1.4322%3B%20.NET%20CLR%202.0.50727%3B%20.NET%20CLR%203.0.4506.2152%3B%20.NET%20CLR%203.5.30729)&aid=jetshop&l=sv&n=microsoft%20internet%20explorer&o=buy_item&p=win32&s=1280x960&sid=www.medistore.se&t=H%C3%A4r%20hittar%20du%20sjukv%C3%A5rdsprodukter%2C%20blodtrycksm%C3%A4tare%2C%20f%C3%B6rsta%20hj%C3%A4lpen%20och%20pulsklockor&u=https%3A%2F%2Fwww.medistore.se%2FOrderDetailsConfirmed.aspx&uid=1268173983791130948&x=68410&z=-60& HTTP/1.1" 200 35 "http://designhousestockholm.xroads.se/Portal/Products/Product.asp?itemId=6004" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1" "-" "-"\n',
+       :heed_recommendation => %Q'72.211.248.18 - - [02/Mar/2009:07:01:28 +0100] "GET /log.gif?_current_price=189%2C00%20kr&_item_id=1441&_rec_for=2407&_rec_type=viewed_this_also_viewed&_recommended_ids%5B0%5D=1441&_recommended_ids%5B1%5D=960&_recommended_ids%5B2%5D=535&_url=http%3A%2F%2Fwww.shirtstore.se%2Fits-on-like-donkey-kong-p-1441.aspx&a=Mozilla%2F4.0%20(compatible%3B%20MSIE%208.0%3B%20Windows%20NT%206.1%3B%20Trident%2F4.0%3B%20GTB6.4%3B%20SLCC2%3B%20.NET%20CLR%202.0.50727%3B%20.NET%20CLR%203.5.30729%3B%20.NET%20CLR%203.0.30729%3B%20Media%20Center%20PC%206.0)&aid=jetshop&e=click&l=sv&n=microsoft%20internet%20explorer&o=heed_recommendation&p=win32&r=http%3A%2F%2Fwww.shirtstore.se%2Ftshirts-c-491-1.aspx&s=1280x800&sid=www.shirtstore.se&t=The%20Hulk%20Distressed&u=http%3A%2F%2Fwww.shirtstore.se%2Fthe-hulk-distressed-p-2407-c-491.aspx&uid=1266087502030697666&x=55724&z=-60& HTTP/1.1" 200 35 "http://designhousestockholm.xroads.se/Portal/Products/Product.asp?itemId=6004" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1" "-" "-"\n',
+     }
+     log = ""
+     lines_data.each do |line_data|
+       line_data = {:aid => "xroads", :sid => "Fortum", :uid => "12345", :timestamp => Time.parse("2009-03-02 07:01:28")}.merge(line_data)
+       observation_type = line_data.delete(:o) || raise("Missing observation type attribute (:o => 'xx') in line data.")
+       raise "Has no template for this observation type" unless template_lines[observation_type.to_sym]
+       # Construct the log line.
+       line = template_lines[observation_type.to_sym].dup
+       line_data.each do |key, value|
+         case key
+         when :timestamp then
+           line.sub!(/\[[^\]]+?\]/, value.nil? ? '' : value.strftime('[%d/%b/%Y:%H:%M:%S +0000]'))
+         when :ip then
+           line.sub!(/^\d+\.\d+\.\d+\.\d+/, value)
+         else
+           value = URI.escape(value, /[^-_.!~*'()a-zA-Z\d]/n)
+           line.gsub!(Regexp.new("#{key.to_s}=[^&]+&", true), "#{key.to_s}=#{value}&")
+         end
+       end
+       log << line
+     end
+     log
+   end
+
+   def mockup_log_file(path, lines_data)
+     File.open(path, "w") do |file|
+       file.write(mockup_log(lines_data))
+     end
+   end
+
+ end
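A hedged sketch of driving the helper above from a test; the test framework, shop id and attribute values are illustrative, only :o is required by mockup_log:

    require 'test/unit'
    require 'time'

    class MyLogTest < Test::Unit::TestCase
      include MockupLog

      def test_two_line_log
        log = mockup_log([
          { :o => :visit_page, :sid => "MyShop", :uid => "42" },
          { :o => :buy_item,   :sid => "MyShop", :uid => "42",
            :timestamp => Time.parse("2009-03-02 07:01:28") }
        ])
        # Two Apache-style lines built from the templates, with sid/uid substituted.
        assert_equal 2, log.split("\n").size
      end
    end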
data/lib/logbox/observation.rb
@@ -0,0 +1,162 @@
+ require 'uri'
+ require 'log_parser'
+
+ class Observation
+
+   BASIC_ATTRIBUTES_NAMES = {
+     :aid => :account_id,
+     :sid => :shop_id,
+     :uid => :user_id,
+     :u => :document_url,
+     :r => :referrer,
+     :t => :document_title,
+     :s => :screen_resolution,
+     :l => :browser_language,
+     :p => :platform,
+     :a => :user_agent,
+     :h => :history_count,
+     :n => :navigator_name,
+     :z => :time_zone_offset,
+     :x => :seed,
+     :o => :observation_type
+   }
+
+   OBSERVATION_ATTRIBUTES_NAMES = {
+     :_item_id => :item_id,
+     :_rec_for => :rec_for,
+     :_title => :title,
+     :_url => :url,
+     :_basket_url => :basket_url,
+     :_description => :description,
+     :_normal_price => :normal_price,
+     :_current_price => :current_price,
+     :_thumbnail => :thumbnail,
+     :_image => :image,
+     :_stock => :stock,
+     :_expires => :expires,
+     :_order_blob => :order_blob,
+     :_order_id => :order_id,
+     :_sku => :sku,
+     :_quantity => :quantity,
+     :_tags => :tags,
+     :_user1 => :user1,
+     :_user2 => :user2,
+     :_user3 => :user3,
+     :_user4 => :user4,
+     :_user5 => :user5,
+     :_culture => :culture,
+     :_currency => :currency,
+     :_group_ref => :group_ref,
+     :_with_vat => :with_vat
+   }
+   BASIC_ATTRIBUTES = BASIC_ATTRIBUTES_NAMES.values
+   OBSERVATION_ATTRIBUTES = OBSERVATION_ATTRIBUTES_NAMES.values
+   ITEM_ATTRIBUTES = OBSERVATION_ATTRIBUTES
+   OBSERVATION_TYPE_STRINGS = %w[visit_page view_item pick_item buy_basket buy_item view_recommendation heed_recommendation]
+   OBSERVATION_TYPES = OBSERVATION_TYPE_STRINGS.map { |e| e.to_sym }
+   # Converted with a big perl expression:
+   # perl -pi -e 's/([&?])_id=/\1_item_id=/g; s/currentprice/current_price/g; s/normalprice/normal_price/g; s/basketurl/basket_url/g; s/([&?])_type=/\1o=/g; s/buyitem/pick_item/g; s/viewitem/view_item/g; s/searchclick/search_pick_item/g; s/sid=bwintest/aid=bwin/g; s/sid=/aid=crossroads&sid=/g; s/aid=bwin/aid=bwin&sid=bwintest/g;' observer.access.log.*
+
+   attr_reader :attributes, :unknown_attributes, :type, :errors
+
+   def initialize(logline_attributes)
+     transfer_attributes(logline_attributes.clone)
+     validate_type
+     ensure_correct_urls
+   end
+
+   def valid?
+     @errors.empty?
+   end
+
+   def [](key)
+     @attributes[key]
+   end
+
+   def view_item?
+     @type == :view_item
+   end
+
+   def pick_item?
+     @type == :pick_item
+   end
+
+   def buy_item?
+     @type == :buy_item
+   end
+
+   private
+
+   # Takes the input hash and moves all known attributes to the @attributes
+   # hash, renaming some of them. Unknown attributes are also saved.
+   def transfer_attributes(logline_attributes)
+     @attributes = {}
+
+     # Transfer basic attributes.
+     (BASIC_ATTRIBUTES_NAMES.to_a + OBSERVATION_ATTRIBUTES_NAMES.to_a).each do |key, new_key|
+       value = logline_attributes.delete(key)
+       @attributes[new_key] = value if value
+     end
+
+     # Transfer server attributes.
+     LogParser::SERVER_ATTRIBUTES.each do |key|
+       value = logline_attributes.delete(key)
+       @attributes[key] = value if value
+     end
+
+     @unknown_attributes = logline_attributes
+   end
+
+   # Ensure that URLs are absolute and don't contain the anchor part.
+   def ensure_correct_urls
+     if @attributes.has_key? :document_url
+       @errors ||= []
+       [:url, :thumbnail, :image, :basket_url].each do |key|
+         begin
+           if @attributes.has_key? key
+             # Ensure that the url is not encoded (it is ok to unencode a url that is not encoded).
+             url = URI.unescape(@attributes[key])
+             # But we store and handle all urls encoded (according to the RFC). URI.join also requires it.
+             url = URI.escape(url)
+             base_url = URI.escape(@attributes[:document_url])
+             # Ensure absolute.
+             full_url = URI.join(base_url, url).to_s
+             # Ensure no debug anchor. Just to make sure that #debug urls do not reach Dumbo.
+             full_url.gsub!(/%23debug(on|off)?/, '')
+             @attributes[key] = full_url
+           end
+         rescue
+           @errors << "url_error_on_#{key}".to_sym
+         end
+
+       end
+     end
+   end
+
+   REQUIRED_ATTRIBUTES = [:user_id, :account_id, :shop_id, :item_id]
+   REQUIRED_ATTRIBUTES_FOR_TYPE = {
+     :buy_basket => [:user_id, :account_id, :shop_id],
+     :buy_item => [:user_id, :account_id, :shop_id, :sku],
+     :visit_page => [:user_id, :account_id, :shop_id],
+     :view_recommendation => [:user_id, :account_id, :shop_id, :rec_for],
+     :heed_recommendation => [:user_id, :account_id, :shop_id, :item_id]
+   }
+
+   def validate_type
+     @type = :unknown
+     @errors ||= []
+
+     # Only handle successful requests.
+     @errors << :status_not_ok and return unless @attributes[:status] == "200"
+
+     # Only handle types we know of.
+     @errors << :unknown_type and return unless OBSERVATION_TYPE_STRINGS.include? @attributes[:observation_type]
+     @type = @attributes[:observation_type].to_sym
+
+     # For all valid types we require some attributes.
+     (REQUIRED_ATTRIBUTES_FOR_TYPE[@type] || REQUIRED_ATTRIBUTES).each do |a|
+       @errors << "missing_#{a}".to_sym unless @attributes[a] =~ /.+/
+     end
+   end
+
+ end
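A short sketch (not from the diff) of feeding a parsed log line into Observation; the require path and sample line are assumptions, reusing the query-parameter names the class renames above:

    require 'logbox/observation'   # assumes lib (and lib/logbox, for the bare 'log_parser' require) is on the load path

    line = '1.2.3.4 - - [02/Mar/2009:06:58:12 +0100] ' \
           '"GET /log.gif?o=view_item&aid=xroads&sid=DHS_Shop&uid=12345&_item_id=6005& HTTP/1.1" 200'
    obs = Observation.new(LogParser.parse_line(line))

    obs.type       # => :view_item
    obs.valid?     # => true (status 200 and all required attributes present)
    obs[:item_id]  # => "6005"      (renamed from _item_id)
    obs[:shop_id]  # => "DHS_Shop"  (renamed from sid)
    obs.errors     # e.g. [:missing_user_id] for an incomplete line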
data/lib/logbox/observation_compiler.rb
@@ -0,0 +1,311 @@
+ # -*- encoding: utf-8 -*-
+ require 'rubygems'
+ require 'time'
+ require 'date'
+ require 'fileutils'
+ require 'right_aws'
+ require 'rake'
+ require 'yaml'
+ require 'logbox'
+
+ module ObservationCompiler
+
+   class Job
+
+     def initialize(options = {})
+       @raw_logs_bucket = options[:raw_logs_bucket] || "rwdata-logs"
+       @raw_logs_prefix = options[:raw_logs_prefix] || "observer-log-"
+       @processed_logs_path = options[:processed_logs_path] || "local_files"
+       temp_dir = File.exist?("/apps/smartass") ? "/apps/smartass/tmp/" : "/tmp/"
+       @working_path = options[:working_path] ||= "#{temp_dir}observation_compiler/#{Process.pid}"
+     end
+
+     def fetch_and_merge(raw_date_range)
+       # A raw log-file for a date may contain observations from the day before.
+       processed_date_range = (raw_date_range.first-1)..(raw_date_range.last)
+
+       create_working_folders
+       copy_processed_logs_to_working_folder(processed_date_range)
+       unzip_processed(processed_date_range)
+
+       raw_date_range.each do |date|
+         download_raw_logs_to_working_folder(date)
+         unzip_raw(date)
+         merge_raw_into_processed(date)
+         remove_raw_logs(date)
+       end
+
+     ensure
+       sort_processed(processed_date_range)
+       zip_processed(processed_date_range)
+       move_processed_back(processed_date_range)
+       remove_working_path
+     end
+
+     def create_working_folders
+       FileUtils.mkdir_p(raw_working_path)
+       FileUtils.mkdir_p(processed_working_path)
+     end
+
+     def raw_working_path
+       File.join(@working_path, "r")
+     end
+
+     def processed_working_path
+       File.join(@working_path, "p")
+     end
+
+     def processed_log_name(date)
+       "observer-log-#{date.strftime('%Y-%m-%d')}"
+     end
+
+     def raw_logs_prefix(date)
+       @raw_logs_prefix + date.strftime('%Y-%m-%d')
+     end
+
+     def raw_log_paths(date)
+       Dir.glob(File.join(raw_working_path, "#{raw_logs_prefix(date)}*")).sort
+     end
+
+     def remove_working_path
+       FileUtils.rm_r @working_path
+     end
+
+     def copy_processed_logs_to_working_folder(date_range)
+       date_range.each do |date|
+         name = processed_log_name(date) + ".gz"
+         source = File.join(@processed_logs_path, name)
+         destination = File.join(processed_working_path, name)
+         if File.exist? source
+           log "Copying #{name} to working folder"
+           FileUtils.copy(source, destination)
+         end
+       end
+     end
+
+     def download_raw_logs_to_working_folder(date)
+       s3 = RightAws::S3.new(*aws_keys)
+       bucket = s3.bucket(@raw_logs_bucket)
+       raise "Unknown bucket: #{@raw_logs_bucket}" if bucket.nil?
+
+       raw_logs = bucket.keys(:prefix => raw_logs_prefix(date))
+       raw_logs.each do |raw_log|
+         log "Getting #{raw_log.name}"
+         File.open(File.join(raw_working_path, raw_log.name), "w") do |file|
+           s3.interface.get(@raw_logs_bucket, raw_log.name) do |chunk|
+             file.write(chunk)
+           end
+         end
+       end
+     end
+
+     def unzip_raw(date)
+       log "Unzipping raw logs for #{date}"
+       raw_log_paths(date).each do |raw_log|
+         system "gunzip #{raw_log}" if raw_log.end_with?(".gz")
+       end
+     end
+
+     def merge_raw_into_processed(date)
+       start_time = Time.now
+       count = 0
+       out_files = {}
+       raw_log_paths(date).each do |raw_log|
+         if raw_log_already_processed?(raw_log)
+           log "Skipping #{raw_log}"
+           next
+         else
+           log "Processing #{raw_log}"
+         end
+         File.foreach raw_log do |line|
+           log_line = LogLine.new(line)
+           next unless log_line.valid?
+           date = log_line.date
+           name = File.join(processed_working_path, processed_log_name(date))
+           out_files[name] ||= File.open(name, "a")
+           out_files[name] << log_line.normalize
+           count += 1
+         end
+       end
+     ensure
+       out_files.each_value { |file| file.close }
+       log "#{count} lines in #{(Time.now - start_time).to_f}s (#{count/(Time.now - start_time).to_f} lines/s)"
+     end
+
+     def raw_log_already_processed?(log_file_name)
+       # Look for the last observation to see if it is already processed.
+       last_observation = `tail -n 1 #{log_file_name}`
+       log_line = LogLine.new(last_observation)
+       return false unless log_line.valid?
+       date = log_line.date
+       processed_file_name = File.join(processed_working_path, processed_log_name(date))
+       File.exist?(processed_file_name) && system("grep", "-qF", last_observation, processed_file_name.chomp)
+     end
+
+     def remove_raw_logs(date)
+       log "Removing raw logs for #{date}"
+       raw_log_paths(date).each do |raw_log|
+         FileUtils.rm(raw_log)
+       end
+     end
+
+     def sort_processed(date_range)
+       date_range.each do |date|
+         name = processed_log_name(date)
+         Dir.chdir processed_working_path do
+           next unless File.exist?(name)
+           log "Sorting #{name}"
+           ENV['LC_ALL'] = 'C'
+           ok = system "sort -t: -k2,4 #{name} | uniq > #{name}.sorted"
+           raise "Sort error!" unless ok
+           File.rename("#{name}.sorted", name)
+         end
+       end
+     end
+
+     def zip_processed(date_range)
+       log "Zipping processed files"
+       date_range.each do |date|
+         name = processed_log_name(date)
+         file = File.join(processed_working_path, name)
+         system "gzip #{file}" if File.exist? file
+       end
+     end
+
+     def unzip_processed(date_range)
+       log "Unzipping processed files"
+       date_range.each do |date|
+         name = processed_log_name(date) + ".gz"
+         file = File.join(processed_working_path, name)
+         system "gunzip #{file}" if File.exist? file
+       end
+     end
+
+     def move_processed_back(date_range)
+       date_range.each do |date|
+         name = processed_log_name(date) + ".gz"
+         source = File.join(processed_working_path, name)
+         destination = File.join(@processed_logs_path, name)
+         if File.exist? source
+           log "Moving #{name} back"
+           FileUtils.move(source, destination)
+         end
+       end
+     end
+
+     def log msg
+       unless defined?(TEST_RUN)
+         puts msg
+       end
+     end
+
+     DEFAULT_KEY_FILE = '/etc/s3_key.yml'
+     def aws_keys
+       if File.exists? DEFAULT_KEY_FILE
+         hash = YAML.load_file(DEFAULT_KEY_FILE)
+         [hash[:access_key_id], hash[:secret_access_key]]
+       else
+         access_key_id = ENV['OBSENTER_S3_KEY']
+         secret_access_key = secret_access_key_from_keychain!(access_key_id)
+         [access_key_id, secret_access_key]
+       end
+     end
+
+     # These two methods are borrowed from Awsborn
+     def secret_access_key_from_keychain! (key_id)
+       secret = secret_access_key_from_keychain key_id
+       raise "Could not find secret access key for #{key_id}" if secret.to_s == ''
+       secret
+     end
+
+     def secret_access_key_from_keychain (key_id)
+       @credentials ||= {}
+       unless @credentials[key_id]
+         dump = `security -q find-generic-password -a "#{key_id}" -g 2>&1`
+         secret_key = dump[/password: "(.*)"/, 1]
+         @credentials[key_id] = secret_key
+       end
+       @credentials[key_id]
+     end
+   end
+
+   class LogLine
+
+     def initialize(line)
+       @line = Logbox::StringEncoder.iconv(line)
+     end
+
+     def valid?
+       normalize
+       true
+     rescue
+       false
+     end
+
+     def normalize
+       normalize_s3_format
+       normalize_apache_format
+       normalize_timestamp
+       @line
+     end
+
+     TIMESTAMP_MATCHER = /(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+)\s([-+]?\d{2})/
+
+     def timestamp
+       unless @timestamp
+         match = @line.match(TIMESTAMP_MATCHER)
+         @timestamp = Time.utc(match[3], match[2], match[1], match[4], match[5], match[6])
+         @timestamp -= match[7].to_i * 3600 # Correct the zone. Works only on whole hours timezones.
+       end
+       @timestamp
+     end
+
+     def date
+       timestamp.send :to_date
+     end
+
+     def to_s
+       @line
+     end
+
+     private
+
+     S = '(\\S+)'
+     # Equivalent to /("(?:\\\\|\\"|[^\\"])*")/
+     Q = '("(?:\\\\\\\\|\\\\"|[^\\\\"])*")'
+     TIMESTAMP = '(\\[[^\\]]+\\])'
+
+     S3_FORMAT = Regexp.new('^' + [S,S,TIMESTAMP,S,S,S,S,S,Q,S,S,S,S,S,S,Q,Q,S].join(' '), 'm')
+     def normalize_s3_format
+       # %Q{owner bucket [16/Mar/2010:16:00:00 +0000] 85.225.221.221 requester requestID operation key "GET /log.gif?_item_id=987&_title=V%C3%A4skor%2FFodral&_url=http%3A%2F%2Fwww.24.se%2Fvaskorfodral-c-987-1.aspx%3Fsortorder%3D1%26direction%3D0%26defps%3D10%26pagesize%3D30%26pagenum%3D3%26useparams%3D0&a=Mozilla%2F5.0%20(Macintosh%3B%20U%3B%20Intel%20Mac%20OS%20X%2010.6%3B%20sv-SE%3B%20rv%3A1.9.2)%20Gecko%2F20100115%20Firefox%2F3.6&aid=jetshop&l=sv-se&n=netscape&o=view_tag&p=macintel&r=http%3A%2F%2Fwww.24.se%2Fvaskorfodral-c-987-1.aspx%3Fsortorder%3D1%26direction%3D0%26defps%3D10%26pagesize%3D30%26pagenum%3D2%26useparams%3D0&s=1280x800&sid=www.24.se&t=V%C3%A4skor%2FFodral&u=http%3A%2F%2Fwww.24.se%2Fvaskorfodral-c-987-1.aspx%3Fsortorder%3D1%26direction%3D0%26defps%3D10%26pagesize%3D30%26pagenum%3D3%26useparams%3D0&uid=1256057859704610385&x=32058&z=-60& HTTP/1.1" 200 - 35 35 6 5 "http://www.24.se/vaskorfodral-c-987-1.aspx?sortorder=1&direction=0&defps=10&pagesize=30&pagenum=3&useparams=0" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; sv-SE; rv:1.9.2) Gecko/20100115 Firefox/3.6" -}
+       match = @line.match(S3_FORMAT)
+       if match
+         @line = %Q(#{match[4]} - - #{match[3]} #{match[9]} #{match[10]} #{match[12]} #{match[16]} #{match[17]} "-" "-"\n)
+       end
+       @line
+     end
+
+     APACHE_WITHOUT_COOKIES = Regexp.new('^' + [S,S,S,TIMESTAMP,Q,S,S,Q,Q].join(' ') + '$')
+     def normalize_apache_format
+       # Add third party cookies at end if they are not there.
+       # %Q{124.191.88.9 - - [26/May/2009:23:59:50 +0000] "GET /log.gif" "Mozilla/5.0"}
+       match = @line.match(APACHE_WITHOUT_COOKIES)
+       if match
+         @line = %Q(#{match[0]} "-" "-"\n)
+       end
+       @line
+     end
+
+     def normalize_timestamp
+       # 12/Apr/2010:09:07:23 +0200 => 12/Apr/2010:07:07:23 +0000
+       match = @line.match(/^(.*?\[)([^\]]+)(\].+)$/m)
+       unless match[2].end_with?('0000')
+         @line = "#{match[1]}#{timestamp.strftime('%d/%b/%Y:%H:%M:%S +0000')}#{match[3]}"
+       end
+       @line
+     end
+
+   end
+
+ end
+
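And a hedged sketch of how the job above might be kicked off, e.g. from a Rake task; the require path and option values are placeholders, and S3 credentials are resolved through aws_keys exactly as in the code:

    require 'logbox/observation_compiler'   # assumes logbox's lib directory is on the load path

    job = ObservationCompiler::Job.new(
      :raw_logs_bucket     => "rwdata-logs",   # the default shown in initialize
      :processed_logs_path => "local_files"
    )

    # Pull yesterday's and today's raw logs from S3, merge them into the per-day
    # processed logs, then sort, gzip and move them back to processed_logs_path.
    job.fetch_and_merge((Date.today - 1)..Date.today)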