logbox 0.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46):
  1. data/.bundle/config +3 -0
  2. data/.rvmrc +2 -0
  3. data/Gemfile +17 -0
  4. data/Gemfile.lock +30 -0
  5. data/README +14 -0
  6. data/Rakefile +74 -0
  7. data/VERSION +1 -0
  8. data/bin/download_logs +20 -0
  9. data/bin/obsstats +39 -0
  10. data/bin/rotate +17 -0
  11. data/bin/viewobs +198 -0
  12. data/lib/logbox.rb +9 -0
  13. data/lib/logbox/ansi_colors.rb +28 -0
  14. data/lib/logbox/log_parser.rb +79 -0
  15. data/lib/logbox/mockup_log.rb +44 -0
  16. data/lib/logbox/observation.rb +162 -0
  17. data/lib/logbox/observation_compiler.rb +311 -0
  18. data/lib/logbox/observation_mover.rb +142 -0
  19. data/lib/logbox/stream_wrapper.rb +20 -0
  20. data/lib/logbox/stream_wrapper/gzip_multi_file.rb +90 -0
  21. data/lib/logbox/stream_wrapper/observation_filter.rb +113 -0
  22. data/lib/logbox/stream_wrapper/order_blob_splitter.rb +96 -0
  23. data/lib/setup_environment.rb +15 -0
  24. data/logbox.gemspec +110 -0
  25. data/test/bin_viewobs_test.rb +42 -0
  26. data/test/fixtures/aws_keys_yaml.txt +3 -0
  27. data/test/fixtures/double-obs.log +1 -0
  28. data/test/fixtures/error_line.log +1 -0
  29. data/test/fixtures/log-for-md5.log +1 -0
  30. data/test/fixtures/log0.log +0 -0
  31. data/test/fixtures/log1.log +1 -0
  32. data/test/fixtures/log1.log.gz +0 -0
  33. data/test/fixtures/log2.log +2 -0
  34. data/test/fixtures/log2.log.gz +0 -0
  35. data/test/fixtures/log_invalid_mixed_encoding.log +1 -0
  36. data/test/fixtures/observation_filter.log +5 -0
  37. data/test/fixtures/unquoted_ugliness.log +2 -0
  38. data/test/log_parser_test.rb +84 -0
  39. data/test/observation_compiler_test.rb +216 -0
  40. data/test/observation_mover_test.rb +135 -0
  41. data/test/observation_test.rb +114 -0
  42. data/test/stream_wrapper/gzip_multi_file_test.rb +147 -0
  43. data/test/stream_wrapper/observation_filter_test.rb +171 -0
  44. data/test/stream_wrapper/order_blob_splitter_test.rb +129 -0
  45. data/test/test_helper.rb +23 -0
  46. metadata +177 -0
@@ -0,0 +1,79 @@
1
+ require 'date'
2
+ require 'cgi'
3
+ require 'stringio'
4
+
5
+ # Parses a standard web server log file stream and returns a hash with
6
+ # key/values for each line. Includes the Enumerable interface.
7
+ class LogParser
8
+ include Enumerable
9
+
10
+ # Support both strings and streams as input.
11
+ def initialize(input)
12
+ input = StringIO.new(input) if input.class == String
13
+ @stream = input
14
+ end
15
+
16
+ # Enumerable interface.
17
+ def each
18
+ while(observation = get_next_observation)
19
+ yield observation
20
+ end
21
+ end
22
+
23
+ def get_next_observation
24
+ line = @stream.gets
25
+ line && LogParser.parse_line(line)
26
+ end
27
+
28
+ LOG_FORMAT = /([^ ]*) [^ ]* [^ ]* \[([^\]]*)\] "([^"]*)" ([^ ]*)/
29
+ LOG_DATE_FORMAT = "%d/%b/%Y:%H:%M:%S %z"
30
+ LOG_KEY_VALUE_FORMAT = /[?&]([^=]+)=([^&]+)/
31
+ SERVER_ATTRIBUTES = [:ip, :timestamp, :request, :status]
32
+
33
+ # Parse one log line and return a hash with all attributes.
34
+ def self.parse_line(line)
35
+ return nil if line.strip.empty?
36
+
37
+ line =~ LOG_FORMAT
38
+ result = {}
39
+
40
+ # Save ip, timestamp and request.
41
+ result[:ip] = $1
42
+ begin
43
+ result[:timestamp] = DateTime.strptime($2, LOG_DATE_FORMAT)
44
+ rescue ArgumentError
45
+ raise ParseError.new("Error while parsing timestamp")
46
+ end
47
+ result[:request] = $3
48
+ result[:status] = $4
49
+
50
+ # Extract key/values pairs from the query part of the request.
51
+ $3.scan(LOG_KEY_VALUE_FORMAT) do |key, value|
52
+ begin
53
+ key = CGI.unescape(key).to_sym
54
+ value = CGI.unescape(value)
55
+ rescue Encoding::CompatibilityError => e
56
+ raise ParseError.new("Error while parsing query parameters")
57
+ end
58
+
59
+ if result.has_key? key
60
+ if result[key].is_a? Array
61
+ result[key] << value
62
+ else
63
+ result[key] = [result[key], value]
64
+ end
65
+ else
66
+ result[key] = value
67
+ end
68
+ end
69
+
70
+ return result
71
+ rescue ParseError
72
+ raise
73
+ rescue
74
+ raise ParseError.new("Unknown parsing error")
75
+ end
76
+ class ParseError < StandardError ; end
77
+ end
78
+
79
+
@@ -0,0 +1,44 @@
1
+ require 'time'
2
+ require 'uri'
3
+
4
+ module MockupLog
5
+
6
+ # Create a custom log file string.
7
+ def mockup_log lines_data
8
+ template_lines = {
9
+ :visit_page => %Q'72.211.248.18 - - [02/Mar/2009:06:58:12 +0100] "GET /log.gif?a=Mozilla%2F5.0%20(Macintosh%3B%20U%3B%20Intel%20Mac%20OS%20X%2010_5_5%3B%20en-us)%20AppleWebKit%2F525.27.1%20(KHTML%2C%20like%20Gecko)%20Version%2F3.2.1%20Safari%2F525.27.1&aid=xroads&l=en-us&n=netscape&o=visit_page&p=macintel&r=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProductGroup.asp%3Fclick%3D1%26pageIndex%3D14%26type%3Dgeneric&s=1920x1200&sid=DHS_Shop&t=Product&u=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProduct.asp%3FitemId%3D6005&uid=1235973461091911420&x=35412&z=480& HTTP/1.1" 200 35 "http://designhousestockholm.xroads.se/Portal/Products/Product.asp?itemId=6005" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1" "-" "-"\n',
10
+ :view_item => %Q'72.211.248.18 - - [02/Mar/2009:06:59:32 +0100] "GET /log.gif?_current_price=US%2471&_image=..%2F..%2FArchive%2FImages%2FWebshop%2FProducts%2FDesignhouseStockholm%2F1132-1000_stor.jpg&_item_id=6005&_sku=HHAW09%20Shelter%20Pants&_thumbnail=..%2F..%2FArchive%2FImages%2FWebshop%2FProducts%2FDesignhouseStockholm%2F1132-1000_liten.jpg&_title=%C2%A0PLEECE%20-%20HOOD&_url=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProduct.asp%3FitemId%3D6005&a=Mozilla%2F5.0%20(Macintosh%3B%20U%3B%20Intel%20Mac%20OS%20X%2010_5_5%3B%20en-us)%20AppleWebKit%2F525.27.1%20(KHTML%2C%20like%20Gecko)%20Version%2F3.2.1%20Safari%2F525.27.1&aid=xroads&l=en-us&n=netscape&o=view_item&p=macintel&r=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProductGroup.asp%3Fclick%3D1%26pageIndex%3D14%26type%3Dgeneric&s=1920x1200&sid=DHS_Shop&t=Product&u=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProduct.asp%3FitemId%3D6005&uid=1235973461091911420&x=35412&z=480& HTTP/1.1" 200 35 "http://designhousestockholm.xroads.se/Portal/Products/Product.asp?itemId=6005" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1" "-" "-"\n',
11
+ :pick_item => %Q'72.211.248.18 - - [02/Mar/2009:07:00:28 +0100] "GET /log.gif?_item_id=6004&a=Mozilla%2F5.0%20(Macintosh%3B%20U%3B%20Intel%20Mac%20OS%20X%2010_5_5%3B%20en-us)%20AppleWebKit%2F525.27.1%20(KHTML%2C%20like%20Gecko)%20Version%2F3.2.1%20Safari%2F525.27.1&aid=xroads&e=click&l=en-us&n=netscape&o=pick_item&p=macintel&r=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProductGroup.asp%3Fclick%3D1%26pageIndex%3D14%26type%3Dgeneric&s=1920x1200&sid=DHS_Shop&t=Product&u=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProduct.asp%3FitemId%3D6004&uid=1235973461091911420&x=98033&z=480& HTTP/1.1" 200 35 "http://designhousestockholm.xroads.se/Portal/Products/Product.asp?itemId=6004" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1" "-" "-"\n',
12
+ :buy_item => %Q'72.211.248.18 - - [02/Mar/2009:07:01:28 +0100] "GET /log.gif?_current_price=1250&_order_id=13945&_quantity=1&_sku=FERTILITETSMONITOR&_title=Fertilitetsmonitor+fr%C3%A5n+Clearblue&a=Mozilla%2F4.0%20(compatible%3B%20MSIE%207.0%3B%20Windows%20NT%205.1%3B%20.NET%20CLR%201.1.4322%3B%20.NET%20CLR%202.0.50727%3B%20.NET%20CLR%203.0.4506.2152%3B%20.NET%20CLR%203.5.30729)&aid=jetshop&l=sv&n=microsoft%20internet%20explorer&o=buy_item&p=win32&s=1280x960&sid=www.medistore.se&t=H%C3%A4r%20hittar%20du%20sjukv%C3%A5rdsprodukter%2C%20blodtrycksm%C3%A4tare%2C%20f%C3%B6rsta%20hj%C3%A4lpen%20och%20pulsklockor&u=https%3A%2F%2Fwww.medistore.se%2FOrderDetailsConfirmed.aspx&uid=1268173983791130948&x=68410&z=-60& HTTP/1.1" 200 35 "http://designhousestockholm.xroads.se/Portal/Products/Product.asp?itemId=6004" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1" "-" "-"\n',
13
+ :heed_recommendation => %Q'72.211.248.18 - - [02/Mar/2009:07:01:28 +0100] "GET /log.gif?_current_price=189%2C00%20kr&_item_id=1441&_rec_for=2407&_rec_type=viewed_this_also_viewed&_recommended_ids%5B0%5D=1441&_recommended_ids%5B1%5D=960&_recommended_ids%5B2%5D=535&_url=http%3A%2F%2Fwww.shirtstore.se%2Fits-on-like-donkey-kong-p-1441.aspx&a=Mozilla%2F4.0%20(compatible%3B%20MSIE%208.0%3B%20Windows%20NT%206.1%3B%20Trident%2F4.0%3B%20GTB6.4%3B%20SLCC2%3B%20.NET%20CLR%202.0.50727%3B%20.NET%20CLR%203.5.30729%3B%20.NET%20CLR%203.0.30729%3B%20Media%20Center%20PC%206.0)&aid=jetshop&e=click&l=sv&n=microsoft%20internet%20explorer&o=heed_recommendation&p=win32&r=http%3A%2F%2Fwww.shirtstore.se%2Ftshirts-c-491-1.aspx&s=1280x800&sid=www.shirtstore.se&t=The%20Hulk%20Distressed&u=http%3A%2F%2Fwww.shirtstore.se%2Fthe-hulk-distressed-p-2407-c-491.aspx&uid=1266087502030697666&x=55724&z=-60& HTTP/1.1" 200 35 "http://designhousestockholm.xroads.se/Portal/Products/Product.asp?itemId=6004" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1" "-" "-"\n',
14
+ }
15
+ log = ""
16
+ lines_data.each do |line_data|
17
+ line_data = {:aid => "xroads", :sid => "Fortum", :uid => "12345", :timestamp => Time.parse("2009-03-02 07:01:28")}.merge(line_data)
18
+ observation_type = line_data.delete(:o) || raise("Missing observation type attribute (:o => 'xx') in line data.")
19
+ raise "Has no template for this observation type" unless template_lines[observation_type.to_sym]
20
+ # Construct the log line.
21
+ line = template_lines[observation_type.to_sym].dup
22
+ line_data.each do |key, value|
23
+ case key
24
+ when :timestamp then
25
+ line.sub!(/\[[^\]]+?\]/, value.nil? ? '' : value.strftime('[%d/%b/%Y:%H:%M:%S +0000]'))
26
+ when :ip then
27
+ line.sub!(/^\d+\.\d+\.\d+\.\d+/, value)
28
+ else
29
+ value = URI.escape(value, /[^-_.!~*'()a-zA-Z\d]/n)
30
+ line.gsub!(Regexp.new("#{key.to_s}=[^&]+&", true), "#{key.to_s}=#{value}&")
31
+ end
32
+ end
33
+ log << line
34
+ end
35
+ log
36
+ end
37
+
38
+ def mockup_log_file(path, lines_data)
39
+ File.open(path, "w") do |file|
40
+ file.write(mockup_log(lines_data))
41
+ end
42
+ end
43
+
44
+ end
@@ -0,0 +1,162 @@
1
+ require 'uri'
2
+ require 'log_parser'
3
+
4
+ class Observation
5
+
6
+ BASIC_ATTRIBUTES_NAMES = {
7
+ :aid => :account_id,
8
+ :sid => :shop_id,
9
+ :uid => :user_id,
10
+ :u => :document_url,
11
+ :r => :referrer,
12
+ :t => :document_title,
13
+ :s => :screen_resolution,
14
+ :l => :browser_language,
15
+ :p => :platform,
16
+ :a => :user_agent,
17
+ :h => :history_count,
18
+ :n => :navigator_name,
19
+ :z => :time_zone_offset,
20
+ :x => :seed,
21
+ :o => :observation_type
22
+ }
23
+
24
+ OBSERVATION_ATTRIBUTES_NAMES = {
25
+ :_item_id => :item_id,
26
+ :_rec_for => :rec_for,
27
+ :_title => :title,
28
+ :_url => :url,
29
+ :_basket_url => :basket_url,
30
+ :_description => :description,
31
+ :_normal_price => :normal_price,
32
+ :_current_price => :current_price,
33
+ :_thumbnail => :thumbnail,
34
+ :_image => :image,
35
+ :_stock => :stock,
36
+ :_expires => :expires,
37
+ :_order_blob => :order_blob,
38
+ :_order_id => :order_id,
39
+ :_sku => :sku,
40
+ :_quantity => :quantity,
41
+ :_tags => :tags,
42
+ :_user1 => :user1,
43
+ :_user2 => :user2,
44
+ :_user3 => :user3,
45
+ :_user4 => :user4,
46
+ :_user5 => :user5,
47
+ :_culture => :culture,
48
+ :_currency => :currency,
49
+ :_group_ref => :group_ref,
50
+ :_with_vat => :with_vat
51
+ }
52
+ BASIC_ATTRIBUTES = BASIC_ATTRIBUTES_NAMES.values
53
+ OBSERVATION_ATTRIBUTES = OBSERVATION_ATTRIBUTES_NAMES.values
54
+ ITEM_ATTRIBUTES = OBSERVATION_ATTRIBUTES
55
+ OBSERVATION_TYPE_STRINGS = %w[visit_page view_item pick_item buy_basket buy_item view_recommendation heed_recommendation]
56
+ OBSERVATION_TYPES = OBSERVATION_TYPE_STRINGS.map { |e| e.to_sym }
57
+ # Konverterade med ett stort perl-uttryck:
58
+ # perl -pi -e 's/([&?])_id=/\1_item_id=/g; s/currentprice/current_price/g; s/normalprice/normal_price/g; s/basketurl/basket_url/g; s/([&?])_type=/\1o=/g; s/buyitem/pick_item/g; s/viewitem/view_item/g; s/searchclick/search_pick_item/g; s/sid=bwintest/aid=bwin/g; s/sid=/aid=crossroads&sid=/g; s/aid=bwin/aid=bwin&sid=bwintest/g;' observer.access.log.*
59
+
60
+ attr_reader :attributes, :unknown_attributes, :type, :errors
61
+
62
+ def initialize(logline_attributes)
63
+ transfer_attributes(logline_attributes.clone)
64
+ validate_type
65
+ ensure_correct_urls
66
+ end
67
+
68
+ def valid?
69
+ @errors.empty?
70
+ end
71
+
72
+ def [](key)
73
+ @attributes[key]
74
+ end
75
+
76
+ def view_item?
77
+ @type == :view_item
78
+ end
79
+
80
+ def pick_item?
81
+ @type == :pick_item
82
+ end
83
+
84
+ def buy_item?
85
+ @type == :buy_item
86
+ end
87
+
88
+ private
89
+
90
+ # Takes the input hash and moves all known attributes to the @attributes
91
+ # hash, renaming some of them. Unknown attributes are also saved.
92
+ def transfer_attributes(logline_attributes)
93
+ @attributes = {}
94
+
95
+ # Transfer basic attributes.
96
+ (BASIC_ATTRIBUTES_NAMES.to_a + OBSERVATION_ATTRIBUTES_NAMES.to_a).each do |key, new_key|
97
+ value = logline_attributes.delete(key)
98
+ @attributes[new_key] = value if value
99
+ end
100
+
101
+ # Transfer server attributes.
102
+ LogParser::SERVER_ATTRIBUTES.each do |key|
103
+ value = logline_attributes.delete(key)
104
+ @attributes[key] = value if value
105
+ end
106
+
107
+ @unknown_attributes = logline_attributes
108
+ end
109
+
110
+ # Ensure that URLs are absolute and don't contain the anchor part.
111
+ def ensure_correct_urls
112
+ if @attributes.has_key? :document_url
113
+ @errors ||= []
114
+ [:url, :thumbnail, :image, :basket_url].each do |key|
115
+ begin
116
+ if @attributes.has_key? key
117
+ # Ensure that the url is not encoded (it is ok to unencode a url that is not encoded).
118
+ url = URI.unescape(@attributes[key])
119
+ # But we store and handle all urls encoded (according to the RFC). URI.join also requires it.
120
+ url = URI.escape(url)
121
+ base_url = URI.escape(@attributes[:document_url])
122
+ # Ensure absoulte.
123
+ full_url = URI.join(base_url, url).to_s
124
+ # Ensure no debug anchor. Just to make sure that #debug urls do not reach Dumbo.
125
+ full_url.gsub!(/%23debug(on|off)?/, '')
126
+ @attributes[key] = full_url
127
+ end
128
+ rescue
129
+ @errors << "url_error_on_#{key}".to_sym
130
+ end
131
+
132
+ end
133
+ end
134
+ end
135
+
136
+ REQUIRED_ATTRIBUTES = [:user_id, :account_id, :shop_id, :item_id]
137
+ REQUIRED_ATTRIBUTES_FOR_TYPE = {
138
+ :buy_basket => [:user_id, :account_id, :shop_id],
139
+ :buy_item => [:user_id, :account_id, :shop_id, :sku],
140
+ :visit_page => [:user_id, :account_id, :shop_id],
141
+ :view_recommendation => [:user_id, :account_id, :shop_id, :rec_for],
142
+ :heed_recommendation => [:user_id, :account_id, :shop_id, :item_id]
143
+ }
144
+
145
+ def validate_type
146
+ @type = :unknown
147
+ @errors ||= []
148
+
149
+ # Only handle successfull requests.
150
+ @errors << :status_not_ok and return unless @attributes[:status] == "200"
151
+
152
+ # Only handle types we know of.
153
+ @errors << :unknown_type and return unless OBSERVATION_TYPE_STRINGS.include? @attributes[:observation_type]
154
+ @type = @attributes[:observation_type].to_sym
155
+
156
+ # For all valid types we require some attributes.
157
+ (REQUIRED_ATTRIBUTES_FOR_TYPE[@type] || REQUIRED_ATTRIBUTES).each do |a|
158
+ @errors << "missing_#{a}".to_sym unless @attributes[a] =~ /.+/
159
+ end
160
+ end
161
+
162
+ end
@@ -0,0 +1,311 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'rubygems'
3
+ require 'time'
4
+ require 'date'
5
+ require 'fileutils'
6
+ require 'right_aws'
7
+ require 'rake'
8
+ require 'yaml'
9
+ require 'logbox'
10
+
11
+ module ObservationCompiler
12
+
13
+ class Job
14
+
15
+ def initialize(options = {})
16
+ @raw_logs_bucket = options[:raw_logs_bucket] || "rwdata-logs"
17
+ @raw_logs_prefix = options[:raw_logs_prefix] || "observer-log-"
18
+ @processed_logs_path = options[:processed_logs_path] || "local_files"
19
+ temp_dir = File.exist?("/apps/smartass") ? "/apps/smartass/tmp/" : "/tmp/"
20
+ @working_path = options[:working_path] ||= "#{temp_dir}observation_compiler/#{Process.pid}"
21
+ end
22
+
23
+ def fetch_and_merge(raw_date_range)
24
+ # A raw log-file for a date may contain observations from the day before.
25
+ processed_date_range = (raw_date_range.first-1)..(raw_date_range.last)
26
+
27
+ create_working_folders
28
+ copy_processed_logs_to_working_folder(processed_date_range)
29
+ unzip_processed(processed_date_range)
30
+
31
+ raw_date_range.each do |date|
32
+ download_raw_logs_to_working_folder(date)
33
+ unzip_raw(date)
34
+ merge_raw_into_processed(date)
35
+ remove_raw_logs(date)
36
+ end
37
+
38
+ ensure
39
+ sort_processed(processed_date_range)
40
+ zip_processed(processed_date_range)
41
+ move_processed_back(processed_date_range)
42
+ remove_working_path
43
+ end
44
+
45
+ def create_working_folders
46
+ FileUtils.mkdir_p(raw_working_path)
47
+ FileUtils.mkdir_p(processed_working_path)
48
+ end
49
+
50
+ def raw_working_path
51
+ File.join(@working_path, "r")
52
+ end
53
+
54
+ def processed_working_path
55
+ File.join(@working_path, "p")
56
+ end
57
+
58
+ def processed_log_name(date)
59
+ "observer-log-#{date.strftime('%Y-%m-%d')}"
60
+ end
61
+
62
+ def raw_logs_prefix(date)
63
+ @raw_logs_prefix + date.strftime('%Y-%m-%d')
64
+ end
65
+
66
+ def raw_log_paths(date)
67
+ Dir.glob(File.join(raw_working_path, "#{raw_logs_prefix(date)}*")).sort
68
+ end
69
+
70
+ def remove_working_path
71
+ FileUtils.rm_r @working_path
72
+ end
73
+
74
+ def copy_processed_logs_to_working_folder(date_range)
75
+ date_range.each do |date|
76
+ name = processed_log_name(date) + ".gz"
77
+ source = File.join(@processed_logs_path, name)
78
+ destination = File.join(processed_working_path, name)
79
+ if File.exist? source
80
+ log "Copying #{name} to working folder"
81
+ FileUtils.copy(source, destination)
82
+ end
83
+ end
84
+ end
85
+
86
+ def download_raw_logs_to_working_folder(date)
87
+ s3 = RightAws::S3.new(*aws_keys)
88
+ bucket = s3.bucket(@raw_logs_bucket)
89
+ raise "Unknown bucket: #{@raw_logs_bucket}" if bucket.nil?
90
+
91
+ raw_logs = bucket.keys(:prefix => raw_logs_prefix(date))
92
+ raw_logs.each do |raw_log|
93
+ log "Getting #{raw_log.name}"
94
+ File.open(File.join(raw_working_path, raw_log.name), "w") do |file|
95
+ s3.interface.get(@raw_logs_bucket, raw_log.name) do |chunk|
96
+ file.write(chunk)
97
+ end
98
+ end
99
+ end
100
+ end
101
+
102
+ def unzip_raw(date)
103
+ log "Unzipping raw logs for #{date}"
104
+ raw_log_paths(date).each do |raw_log|
105
+ system "gunzip #{raw_log}" if raw_log.end_with?(".gz")
106
+ end
107
+ end
108
+
109
+ def merge_raw_into_processed(date)
110
+ start_time = Time.now
111
+ count = 0
112
+ out_files = {}
113
+ raw_log_paths(date).each do |raw_log|
114
+ if raw_log_already_processed?(raw_log)
115
+ log "Skipping #{raw_log}"
116
+ next
117
+ else
118
+ log "Processing #{raw_log}"
119
+ end
120
+ File.foreach raw_log do |line|
121
+ log_line = LogLine.new(line)
122
+ next unless log_line.valid?
123
+ date = log_line.date
124
+ name = File.join(processed_working_path, processed_log_name(date))
125
+ out_files[name] ||= File.open(name, "a")
126
+ out_files[name] << log_line.normalize
127
+ count += 1
128
+ end
129
+ end
130
+ ensure
131
+ out_files.each_value { |file| file.close }
132
+ log "#{count} rader på #{(Time.now - start_time).to_f}s (#{count/(Time.now - start_time).to_f} rader/s)"
133
+ end
134
+
135
+ def raw_log_already_processed?(log_file_name)
136
+ # Look for the last observation to see if it is already processed.
137
+ last_observation = `tail -n 1 #{log_file_name}`
138
+ log_line = LogLine.new(last_observation)
139
+ return false unless log_line.valid?
140
+ date = log_line.date
141
+ processed_file_name = File.join(processed_working_path, processed_log_name(date))
142
+ File.exist?(processed_file_name) && system("grep", "-qF", last_observation, processed_file_name.chomp)
143
+ end
144
+
145
+ def remove_raw_logs(date)
146
+ log "Removing raw logs for #{date}"
147
+ raw_log_paths(date).each do |raw_log|
148
+ FileUtils.rm(raw_log)
149
+ end
150
+ end
151
+
152
+ def sort_processed(date_range)
153
+ date_range.each do |date|
154
+ name = processed_log_name(date)
155
+ Dir.chdir processed_working_path do
156
+ next unless File.exist?(name)
157
+ log "Sorting #{name}"
158
+ ENV['LC_ALL'] = 'C'
159
+ ok = system "sort -t: -k2,4 #{name} | uniq > #{name}.sorted"
160
+ raise "Sort error!" unless ok
161
+ File.rename("#{name}.sorted", name)
162
+ end
163
+ end
164
+ end
165
+
166
+ def zip_processed(date_range)
167
+ log "Zipping processed files"
168
+ date_range.each do |date|
169
+ name = processed_log_name(date)
170
+ file = File.join(processed_working_path, name)
171
+ system "gzip #{file}" if File.exist? file
172
+ end
173
+ end
174
+
175
+ def unzip_processed(date_range)
176
+ log "Unzipping processed files"
177
+ date_range.each do |date|
178
+ name = processed_log_name(date) + ".gz"
179
+ file = File.join(processed_working_path, name)
180
+ system "gunzip #{file}" if File.exist? file
181
+ end
182
+ end
183
+
184
+ def move_processed_back(date_range)
185
+ date_range.each do |date|
186
+ name = processed_log_name(date) + ".gz"
187
+ source = File.join(processed_working_path, name)
188
+ destination = File.join(@processed_logs_path, name)
189
+ if File.exist? source
190
+ log "Moving #{name} back"
191
+ FileUtils.move(source, destination)
192
+ end
193
+ end
194
+ end
195
+
196
+ def log msg
197
+ unless defined?(TEST_RUN)
198
+ puts msg
199
+ end
200
+ end
201
+
202
+ DEFAULT_KEY_FILE = '/etc/s3_key.yml'
203
+ def aws_keys
204
+ if File.exists? DEFAULT_KEY_FILE
205
+ hash = YAML.load_file(DEFAULT_KEY_FILE)
206
+ [hash[:access_key_id], hash[:secret_access_key]]
207
+ else
208
+ access_key_id = ENV['OBSENTER_S3_KEY']
209
+ secret_access_key = secret_access_key_from_keychain!(access_key_id)
210
+ [access_key_id, secret_access_key]
211
+ end
212
+ end
213
+
214
+ # These two methods are borrowed from Awsborn
215
+ def secret_access_key_from_keychain! (key_id)
216
+ secret = secret_access_key_from_keychain key_id
217
+ raise "Could not find secret access key for #{key_id}" if secret.to_s == ''
218
+ secret
219
+ end
220
+
221
+ def secret_access_key_from_keychain (key_id)
222
+ @credentials ||= {}
223
+ unless @credentials[key_id]
224
+ dump = `security -q find-generic-password -a "#{key_id}" -g 2>&1`
225
+ secret_key = dump[/password: "(.*)"/, 1]
226
+ @credentials[key_id] = secret_key
227
+ end
228
+ @credentials[key_id]
229
+ end
230
+ end
231
+
232
+ class LogLine
233
+
234
+ def initialize(line)
235
+ @line = Logbox::StringEncoder.iconv(line)
236
+ end
237
+
238
+ def valid?
239
+ normalize
240
+ true
241
+ rescue
242
+ false
243
+ end
244
+
245
+ def normalize
246
+ normalize_s3_format
247
+ normalize_apache_format
248
+ normalize_timestamp
249
+ @line
250
+ end
251
+
252
+ TIMESTAMP_MATCHER = /(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+)\s([-+]?\d{2})/
253
+
254
+ def timestamp
255
+ unless @timestamp
256
+ match = @line.match(TIMESTAMP_MATCHER)
257
+ @timestamp = Time.utc(match[3], match[2], match[1], match[4], match[5], match[6])
258
+ @timestamp -= match[7].to_i * 3600 # Correct the zone. Works only on whole hours timezones.
259
+ end
260
+ @timestamp
261
+ end
262
+
263
+ def date
264
+ timestamp.send :to_date
265
+ end
266
+
267
+ def to_s
268
+ @line
269
+ end
270
+
271
+ private
272
+
273
+ S = '(\\S+)'
274
+ # Equivalent to /("(?:\\\\|\\"|[^\\"])*")/
275
+ Q = '("(?:\\\\\\\\|\\\\"|[^\\\\"])*")'
276
+ TIMESTAMP = '(\\[[^\\]]+\\])'
277
+
278
+ S3_FORMAT = Regexp.new('^' + [S,S,TIMESTAMP,S,S,S,S,S,Q,S,S,S,S,S,S,Q,Q,S].join(' '), 'm')
279
+ def normalize_s3_format
280
+ # %Q{owner bucket [16/Mar/2010:16:00:00 +0000] 85.225.221.221 requester requestID operation key "GET /log.gif?_item_id=987&_title=V%C3%A4skor%2FFodral&_url=http%3A%2F%2Fwww.24.se%2Fvaskorfodral-c-987-1.aspx%3Fsortorder%3D1%26direction%3D0%26defps%3D10%26pagesize%3D30%26pagenum%3D3%26useparams%3D0&a=Mozilla%2F5.0%20(Macintosh%3B%20U%3B%20Intel%20Mac%20OS%20X%2010.6%3B%20sv-SE%3B%20rv%3A1.9.2)%20Gecko%2F20100115%20Firefox%2F3.6&aid=jetshop&l=sv-se&n=netscape&o=view_tag&p=macintel&r=http%3A%2F%2Fwww.24.se%2Fvaskorfodral-c-987-1.aspx%3Fsortorder%3D1%26direction%3D0%26defps%3D10%26pagesize%3D30%26pagenum%3D2%26useparams%3D0&s=1280x800&sid=www.24.se&t=V%C3%A4skor%2FFodral&u=http%3A%2F%2Fwww.24.se%2Fvaskorfodral-c-987-1.aspx%3Fsortorder%3D1%26direction%3D0%26defps%3D10%26pagesize%3D30%26pagenum%3D3%26useparams%3D0&uid=1256057859704610385&x=32058&z=-60& HTTP/1.1" 200 - 35 35 6 5 "http://www.24.se/vaskorfodral-c-987-1.aspx?sortorder=1&direction=0&defps=10&pagesize=30&pagenum=3&useparams=0" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; sv-SE; rv:1.9.2) Gecko/20100115 Firefox/3.6" -}
281
+ match = @line.match(S3_FORMAT)
282
+ if match
283
+ @line = %Q(#{match[4]} - - #{match[3]} #{match[9]} #{match[10]} #{match[12]} #{match[16]} #{match[17]} "-" "-"\n)
284
+ end
285
+ @line
286
+ end
287
+
288
+ APACHE_WITHOUT_COOKIES = Regexp.new('^' + [S,S,S,TIMESTAMP,Q,S,S,Q,Q].join(' ') + '$')
289
+ def normalize_apache_format
290
+ # Add third party cookies at end if they are not there.
291
+ # %Q{124.191.88.9 - - [26/May/2009:23:59:50 +0000] "GET /log.gif" "Mozilla/5.0"}
292
+ match = @line.match(APACHE_WITHOUT_COOKIES)
293
+ if match
294
+ @line = %Q(#{match[0]} "-" "-"\n)
295
+ end
296
+ @line
297
+ end
298
+
299
+ def normalize_timestamp
300
+ # 12/Apr/2010:09:07:23 +0200 => 12/Apr/2010:07:07:23 +0000
301
+ match = @line.match(/^(.*?\[)([^\]]+)(\].+)$/m)
302
+ unless match[2].end_with?('0000')
303
+ @line = "#{match[1]}#{timestamp.strftime('%d/%b/%Y:%H:%M:%S +0000')}#{match[3]}"
304
+ end
305
+ @line
306
+ end
307
+
308
+ end
309
+
310
+ end
311
+