logbox 0.2.10
Sign up to get free protection for your applications and to get access to all the features.
- data/.bundle/config +3 -0
- data/.rvmrc +2 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +30 -0
- data/README +14 -0
- data/Rakefile +74 -0
- data/VERSION +1 -0
- data/bin/download_logs +20 -0
- data/bin/obsstats +39 -0
- data/bin/rotate +17 -0
- data/bin/viewobs +198 -0
- data/lib/logbox.rb +9 -0
- data/lib/logbox/ansi_colors.rb +28 -0
- data/lib/logbox/log_parser.rb +79 -0
- data/lib/logbox/mockup_log.rb +44 -0
- data/lib/logbox/observation.rb +162 -0
- data/lib/logbox/observation_compiler.rb +311 -0
- data/lib/logbox/observation_mover.rb +142 -0
- data/lib/logbox/stream_wrapper.rb +20 -0
- data/lib/logbox/stream_wrapper/gzip_multi_file.rb +90 -0
- data/lib/logbox/stream_wrapper/observation_filter.rb +113 -0
- data/lib/logbox/stream_wrapper/order_blob_splitter.rb +96 -0
- data/lib/setup_environment.rb +15 -0
- data/logbox.gemspec +110 -0
- data/test/bin_viewobs_test.rb +42 -0
- data/test/fixtures/aws_keys_yaml.txt +3 -0
- data/test/fixtures/double-obs.log +1 -0
- data/test/fixtures/error_line.log +1 -0
- data/test/fixtures/log-for-md5.log +1 -0
- data/test/fixtures/log0.log +0 -0
- data/test/fixtures/log1.log +1 -0
- data/test/fixtures/log1.log.gz +0 -0
- data/test/fixtures/log2.log +2 -0
- data/test/fixtures/log2.log.gz +0 -0
- data/test/fixtures/log_invalid_mixed_encoding.log +1 -0
- data/test/fixtures/observation_filter.log +5 -0
- data/test/fixtures/unquoted_ugliness.log +2 -0
- data/test/log_parser_test.rb +84 -0
- data/test/observation_compiler_test.rb +216 -0
- data/test/observation_mover_test.rb +135 -0
- data/test/observation_test.rb +114 -0
- data/test/stream_wrapper/gzip_multi_file_test.rb +147 -0
- data/test/stream_wrapper/observation_filter_test.rb +171 -0
- data/test/stream_wrapper/order_blob_splitter_test.rb +129 -0
- data/test/test_helper.rb +23 -0
- metadata +177 -0
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'date'
|
2
|
+
require 'cgi'
|
3
|
+
require 'stringio'
|
4
|
+
|
5
|
+
# Parses a standard web server access log stream and yields one hash of
# key/values per log line. Includes the Enumerable interface.
#
# Each hash holds the server attributes (:ip, :timestamp, :request, :status)
# plus one entry per query parameter found in the request, keyed by the
# CGI-unescaped parameter name as a symbol. A parameter that occurs more than
# once is collected into an array of values.
class LogParser
  include Enumerable

  # Raised whenever a log line cannot be parsed.
  class ParseError < StandardError; end

  LOG_FORMAT = /([^ ]*) [^ ]* [^ ]* \[([^\]]*)\] "([^"]*)" ([^ ]*)/
  LOG_DATE_FORMAT = "%d/%b/%Y:%H:%M:%S %z"
  LOG_KEY_VALUE_FORMAT = /[?&]([^=]+)=([^&]+)/
  SERVER_ATTRIBUTES = [:ip, :timestamp, :request, :status]

  # Support both strings and streams as input. A stream only needs to
  # respond to #gets.
  def initialize(input)
    input = StringIO.new(input) if input.is_a?(String)
    @stream = input
  end

  # Enumerable interface. Yields every parsed line of the stream; returns an
  # enumerator when called without a block.
  def each
    return enum_for(:each) unless block_given?

    while (observation = get_next_observation)
      yield observation
    end
  end

  # Returns the parsed hash for the next non-blank line, or nil at end of
  # stream. Blank lines are skipped here: parse_line returns nil for them,
  # and treating that nil as "end of stream" would silently truncate the
  # iteration at the first empty line.
  def get_next_observation
    while (line = @stream.gets)
      observation = LogParser.parse_line(line)
      return observation if observation
    end
    nil
  end

  # Parse one log line and return a hash with all attributes.
  #
  # Returns nil for a blank line. Raises ParseError when the line does not
  # match LOG_FORMAT, when the timestamp cannot be parsed, or when a query
  # parameter cannot be unescaped.
  def self.parse_line(line)
    return nil if line.strip.empty?

    match = LOG_FORMAT.match(line)
    raise ParseError, "Unknown parsing error" unless match

    ip, timestamp, request, status = match.captures
    result = {}

    # Save ip, timestamp, request and status (in this order).
    result[:ip] = ip
    begin
      result[:timestamp] = DateTime.strptime(timestamp, LOG_DATE_FORMAT)
    rescue ArgumentError
      raise ParseError, "Error while parsing timestamp"
    end
    result[:request] = request
    result[:status] = status

    # Extract key/value pairs from the query part of the request.
    request.scan(LOG_KEY_VALUE_FORMAT) do |raw_key, raw_value|
      begin
        key = CGI.unescape(raw_key).to_sym
        value = CGI.unescape(raw_value)
      rescue Encoding::CompatibilityError
        raise ParseError, "Error while parsing query parameters"
      end

      if !result.has_key?(key)
        result[key] = value
      elsif result[key].is_a?(Array)
        result[key] << value
      else
        # Second occurrence of the key: promote the entry to an array.
        result[key] = [result[key], value]
      end
    end

    result
  rescue ParseError
    raise
  rescue
    # Anything unexpected is reported through the one error type callers handle.
    raise ParseError, "Unknown parsing error"
  end
end
|
78
|
+
|
79
|
+
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'time'
|
2
|
+
require 'uri'
|
3
|
+
|
4
|
+
# Test helper mixin that builds access-log text from a set of template lines,
# one template per observation type, with selected fields overridden.
module MockupLog

  # Every character outside this set is percent-encoded in overridden values.
  UNSAFE_QUERY_CHARS = /[^-_.!~*'()a-zA-Z\d]/n

  # URI.escape was removed in Ruby 3.0; the RFC 2396 parser still provides
  # the same escaping. Older Rubies only know it as URI::Parser.
  URI_PARSER = defined?(URI::RFC2396_Parser) ? URI::RFC2396_Parser.new : URI::Parser.new

  # One complete sample log line per supported observation type.
  TEMPLATE_LINES = {
    :visit_page => %Q'72.211.248.18 - - [02/Mar/2009:06:58:12 +0100] "GET /log.gif?a=Mozilla%2F5.0%20(Macintosh%3B%20U%3B%20Intel%20Mac%20OS%20X%2010_5_5%3B%20en-us)%20AppleWebKit%2F525.27.1%20(KHTML%2C%20like%20Gecko)%20Version%2F3.2.1%20Safari%2F525.27.1&aid=xroads&l=en-us&n=netscape&o=visit_page&p=macintel&r=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProductGroup.asp%3Fclick%3D1%26pageIndex%3D14%26type%3Dgeneric&s=1920x1200&sid=DHS_Shop&t=Product&u=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProduct.asp%3FitemId%3D6005&uid=1235973461091911420&x=35412&z=480& HTTP/1.1" 200 35 "http://designhousestockholm.xroads.se/Portal/Products/Product.asp?itemId=6005" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1" "-" "-"\n',
    :view_item => %Q'72.211.248.18 - - [02/Mar/2009:06:59:32 +0100] "GET /log.gif?_current_price=US%2471&_image=..%2F..%2FArchive%2FImages%2FWebshop%2FProducts%2FDesignhouseStockholm%2F1132-1000_stor.jpg&_item_id=6005&_sku=HHAW09%20Shelter%20Pants&_thumbnail=..%2F..%2FArchive%2FImages%2FWebshop%2FProducts%2FDesignhouseStockholm%2F1132-1000_liten.jpg&_title=%C2%A0PLEECE%20-%20HOOD&_url=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProduct.asp%3FitemId%3D6005&a=Mozilla%2F5.0%20(Macintosh%3B%20U%3B%20Intel%20Mac%20OS%20X%2010_5_5%3B%20en-us)%20AppleWebKit%2F525.27.1%20(KHTML%2C%20like%20Gecko)%20Version%2F3.2.1%20Safari%2F525.27.1&aid=xroads&l=en-us&n=netscape&o=view_item&p=macintel&r=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProductGroup.asp%3Fclick%3D1%26pageIndex%3D14%26type%3Dgeneric&s=1920x1200&sid=DHS_Shop&t=Product&u=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProduct.asp%3FitemId%3D6005&uid=1235973461091911420&x=35412&z=480& HTTP/1.1" 200 35 "http://designhousestockholm.xroads.se/Portal/Products/Product.asp?itemId=6005" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1" "-" "-"\n',
    :pick_item => %Q'72.211.248.18 - - [02/Mar/2009:07:00:28 +0100] "GET /log.gif?_item_id=6004&a=Mozilla%2F5.0%20(Macintosh%3B%20U%3B%20Intel%20Mac%20OS%20X%2010_5_5%3B%20en-us)%20AppleWebKit%2F525.27.1%20(KHTML%2C%20like%20Gecko)%20Version%2F3.2.1%20Safari%2F525.27.1&aid=xroads&e=click&l=en-us&n=netscape&o=pick_item&p=macintel&r=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProductGroup.asp%3Fclick%3D1%26pageIndex%3D14%26type%3Dgeneric&s=1920x1200&sid=DHS_Shop&t=Product&u=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProduct.asp%3FitemId%3D6004&uid=1235973461091911420&x=98033&z=480& HTTP/1.1" 200 35 "http://designhousestockholm.xroads.se/Portal/Products/Product.asp?itemId=6004" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1" "-" "-"\n',
    :buy_item => %Q'72.211.248.18 - - [02/Mar/2009:07:01:28 +0100] "GET /log.gif?_current_price=1250&_order_id=13945&_quantity=1&_sku=FERTILITETSMONITOR&_title=Fertilitetsmonitor+fr%C3%A5n+Clearblue&a=Mozilla%2F4.0%20(compatible%3B%20MSIE%207.0%3B%20Windows%20NT%205.1%3B%20.NET%20CLR%201.1.4322%3B%20.NET%20CLR%202.0.50727%3B%20.NET%20CLR%203.0.4506.2152%3B%20.NET%20CLR%203.5.30729)&aid=jetshop&l=sv&n=microsoft%20internet%20explorer&o=buy_item&p=win32&s=1280x960&sid=www.medistore.se&t=H%C3%A4r%20hittar%20du%20sjukv%C3%A5rdsprodukter%2C%20blodtrycksm%C3%A4tare%2C%20f%C3%B6rsta%20hj%C3%A4lpen%20och%20pulsklockor&u=https%3A%2F%2Fwww.medistore.se%2FOrderDetailsConfirmed.aspx&uid=1268173983791130948&x=68410&z=-60& HTTP/1.1" 200 35 "http://designhousestockholm.xroads.se/Portal/Products/Product.asp?itemId=6004" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1" "-" "-"\n',
    :heed_recommendation => %Q'72.211.248.18 - - [02/Mar/2009:07:01:28 +0100] "GET /log.gif?_current_price=189%2C00%20kr&_item_id=1441&_rec_for=2407&_rec_type=viewed_this_also_viewed&_recommended_ids%5B0%5D=1441&_recommended_ids%5B1%5D=960&_recommended_ids%5B2%5D=535&_url=http%3A%2F%2Fwww.shirtstore.se%2Fits-on-like-donkey-kong-p-1441.aspx&a=Mozilla%2F4.0%20(compatible%3B%20MSIE%208.0%3B%20Windows%20NT%206.1%3B%20Trident%2F4.0%3B%20GTB6.4%3B%20SLCC2%3B%20.NET%20CLR%202.0.50727%3B%20.NET%20CLR%203.5.30729%3B%20.NET%20CLR%203.0.30729%3B%20Media%20Center%20PC%206.0)&aid=jetshop&e=click&l=sv&n=microsoft%20internet%20explorer&o=heed_recommendation&p=win32&r=http%3A%2F%2Fwww.shirtstore.se%2Ftshirts-c-491-1.aspx&s=1280x800&sid=www.shirtstore.se&t=The%20Hulk%20Distressed&u=http%3A%2F%2Fwww.shirtstore.se%2Fthe-hulk-distressed-p-2407-c-491.aspx&uid=1266087502030697666&x=55724&z=-60& HTTP/1.1" 200 35 "http://designhousestockholm.xroads.se/Portal/Products/Product.asp?itemId=6004" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1" "-" "-"\n',
  }

  # Create a custom log file string.
  #
  # lines_data is an enumerable of hashes, one per log line. Each hash must
  # contain :o (the observation type, naming a template). The remaining
  # entries override fields of that template:
  #   :timestamp - object responding to strftime; nil blanks the timestamp
  #   :ip        - replaces the leading IP address
  #   other keys - replace the value of the query parameter with that name
  # :aid, :sid, :uid and :timestamp get defaults when not given.
  #
  # Returns the concatenated log lines. Raises RuntimeError when :o is
  # missing or names an unknown template.
  def mockup_log lines_data
    log = "".dup
    lines_data.each do |line_data|
      line_data = {:aid => "xroads", :sid => "Fortum", :uid => "12345", :timestamp => Time.parse("2009-03-02 07:01:28")}.merge(line_data)
      observation_type = line_data.delete(:o) || raise("Missing observation type attribute (:o => 'xx') in line data.")
      template = TEMPLATE_LINES[observation_type.to_sym]
      raise "Has no template for this observation type" unless template

      # Construct the log line.
      line = template.dup
      line_data.each do |key, value|
        case key
        when :timestamp
          # The value is formatted as-is and labelled +0000.
          line.sub!(/\[[^\]]+?\]/, value.nil? ? '' : value.strftime('[%d/%b/%Y:%H:%M:%S +0000]'))
        when :ip
          line.sub!(/^\d+\.\d+\.\d+\.\d+/, value)
        else
          escaped = URI_PARSER.escape(value, UNSAFE_QUERY_CHARS)
          # Anchor on the preceding "?" or "&" so that a short key such as
          # :u or :l only replaces its own parameter and not the tail of a
          # longer one ("_sku=", "_url=", "_thumbnail=").
          pattern = /([?&])#{Regexp.escape(key.to_s)}=[^&]+&/i
          line.gsub!(pattern) { "#{$1}#{key}=#{escaped}&" }
        end
      end
      log << line
    end
    log
  end

  # Writes the log produced by mockup_log(lines_data) to the file at path,
  # replacing any existing content.
  def mockup_log_file(path, lines_data)
    File.open(path, "w") do |file|
      file.write(mockup_log(lines_data))
    end
  end

end
|
@@ -0,0 +1,162 @@
|
|
1
|
+
require 'uri'
|
2
|
+
require 'log_parser'
|
3
|
+
|
4
|
+
# One observation built from the attribute hash of a parsed log line.
#
# Known short log keys are renamed to descriptive names and stored in
# #attributes; everything else ends up in #unknown_attributes. The
# observation is then validated: #type is the observation type (or :unknown)
# and #errors lists the problems found as symbols.
class Observation

  # Log key => attribute name, for attributes sent with every observation.
  BASIC_ATTRIBUTES_NAMES = {
    :aid => :account_id,
    :sid => :shop_id,
    :uid => :user_id,
    :u => :document_url,
    :r => :referrer,
    :t => :document_title,
    :s => :screen_resolution,
    :l => :browser_language,
    :p => :platform,
    :a => :user_agent,
    :h => :history_count,
    :n => :navigator_name,
    :z => :time_zone_offset,
    :x => :seed,
    :o => :observation_type
  }

  # Log key => attribute name, for attributes specific to an observation.
  OBSERVATION_ATTRIBUTES_NAMES = {
    :_item_id => :item_id,
    :_rec_for => :rec_for,
    :_title => :title,
    :_url => :url,
    :_basket_url => :basket_url,
    :_description => :description,
    :_normal_price => :normal_price,
    :_current_price => :current_price,
    :_thumbnail => :thumbnail,
    :_image => :image,
    :_stock => :stock,
    :_expires => :expires,
    :_order_blob => :order_blob,
    :_order_id => :order_id,
    :_sku => :sku,
    :_quantity => :quantity,
    :_tags => :tags,
    :_user1 => :user1,
    :_user2 => :user2,
    :_user3 => :user3,
    :_user4 => :user4,
    :_user5 => :user5,
    :_culture => :culture,
    :_currency => :currency,
    :_group_ref => :group_ref,
    :_with_vat => :with_vat
  }
  BASIC_ATTRIBUTES = BASIC_ATTRIBUTES_NAMES.values
  OBSERVATION_ATTRIBUTES = OBSERVATION_ATTRIBUTES_NAMES.values
  ITEM_ATTRIBUTES = OBSERVATION_ATTRIBUTES
  OBSERVATION_TYPE_STRINGS = %w[visit_page view_item pick_item buy_basket buy_item view_recommendation heed_recommendation]
  OBSERVATION_TYPES = OBSERVATION_TYPE_STRINGS.map { |e| e.to_sym }
  # Converted with one big perl expression:
  # perl -pi -e 's/([&?])_id=/\1_item_id=/g; s/currentprice/current_price/g; s/normalprice/normal_price/g; s/basketurl/basket_url/g; s/([&?])_type=/\1o=/g; s/buyitem/pick_item/g; s/viewitem/view_item/g; s/searchclick/search_pick_item/g; s/sid=bwintest/aid=bwin/g; s/sid=/aid=crossroads&sid=/g; s/aid=bwin/aid=bwin&sid=bwintest/g;' observer.access.log.*

  # Attributes required for types without an entry in REQUIRED_ATTRIBUTES_FOR_TYPE.
  REQUIRED_ATTRIBUTES = [:user_id, :account_id, :shop_id, :item_id]
  REQUIRED_ATTRIBUTES_FOR_TYPE = {
    :buy_basket => [:user_id, :account_id, :shop_id],
    :buy_item => [:user_id, :account_id, :shop_id, :sku],
    :visit_page => [:user_id, :account_id, :shop_id],
    :view_recommendation => [:user_id, :account_id, :shop_id, :rec_for],
    :heed_recommendation => [:user_id, :account_id, :shop_id, :item_id]
  }

  # URI.escape and URI.unescape were removed in Ruby 3.0; the RFC 2396
  # parser still provides both. Older Rubies only know it as URI::Parser.
  URI_PARSER = defined?(URI::RFC2396_Parser) ? URI::RFC2396_Parser.new : URI::Parser.new

  attr_reader :attributes, :unknown_attributes, :type, :errors

  # logline_attributes is a hash as returned by LogParser.parse_line. It is
  # cloned (shallowly) first, so the caller's hash keeps all its keys.
  def initialize(logline_attributes)
    transfer_attributes(logline_attributes.clone)
    validate_type
    ensure_correct_urls
  end

  # True when validation recorded no errors.
  def valid?
    @errors.empty?
  end

  # Returns the attribute stored under the (renamed) key, or nil.
  def [](key)
    @attributes[key]
  end

  def view_item?
    @type == :view_item
  end

  def pick_item?
    @type == :pick_item
  end

  def buy_item?
    @type == :buy_item
  end

  private

  # Takes the input hash and moves all known attributes to the @attributes
  # hash, renaming some of them. Whatever is left in the input hash is kept
  # as @unknown_attributes. Attributes with a nil or false value are dropped.
  def transfer_attributes(logline_attributes)
    @attributes = {}

    # Transfer basic and observation attributes under their new names.
    (BASIC_ATTRIBUTES_NAMES.to_a + OBSERVATION_ATTRIBUTES_NAMES.to_a).each do |key, new_key|
      value = logline_attributes.delete(key)
      @attributes[new_key] = value if value
    end

    # Transfer server attributes; these keep their names.
    LogParser::SERVER_ATTRIBUTES.each do |key|
      value = logline_attributes.delete(key)
      @attributes[key] = value if value
    end

    @unknown_attributes = logline_attributes
  end

  # Ensure that URLs are absolute and don't contain the debug anchor.
  # Runs only when a document URL is present, since relative URLs are
  # resolved against it. A URL that cannot be processed is left unchanged
  # and recorded as :url_error_on_<key>.
  def ensure_correct_urls
    return unless @attributes.has_key?(:document_url)

    @errors ||= []
    [:url, :thumbnail, :image, :basket_url].each do |key|
      next unless @attributes.has_key?(key)

      begin
        # Ensure that the url is not encoded (it is ok to unencode a url that is not encoded).
        url = URI_PARSER.unescape(@attributes[key])
        # But we store and handle all urls encoded (according to the RFC). URI.join also requires it.
        url = URI_PARSER.escape(url)
        base_url = URI_PARSER.escape(@attributes[:document_url])
        # Ensure absolute.
        full_url = URI.join(base_url, url).to_s
        # Ensure no debug anchor. Just to make sure that #debug urls do not reach Dumbo.
        full_url.gsub!(/%23debug(on|off)?/, '')
        @attributes[key] = full_url
      rescue
        @errors << "url_error_on_#{key}".to_sym
      end
    end
  end

  # Sets @type and records validation errors: :status_not_ok, :unknown_type
  # or one :missing_<attribute> per absent required attribute. Validation
  # stops at the first of the two former errors, leaving @type as :unknown.
  def validate_type
    @type = :unknown
    @errors ||= []

    # Only handle successful requests.
    unless @attributes[:status] == "200"
      @errors << :status_not_ok
      return
    end

    # Only handle types we know of.
    unless OBSERVATION_TYPE_STRINGS.include?(@attributes[:observation_type])
      @errors << :unknown_type
      return
    end
    @type = @attributes[:observation_type].to_sym

    # For all valid types we require some attributes.
    (REQUIRED_ATTRIBUTES_FOR_TYPE[@type] || REQUIRED_ATTRIBUTES).each do |name|
      @errors << "missing_#{name}".to_sym unless attribute_present?(@attributes[name])
    end
  end

  # True when the value matches /.+/. Values that cannot be matched against
  # a regexp at all (e.g. the Array produced by a repeated query parameter,
  # which no longer responds to =~ on Ruby 3.2+) count as missing instead of
  # raising.
  def attribute_present?(value)
    value.respond_to?(:=~) && !(value =~ /.+/).nil?
  end

end
|
@@ -0,0 +1,311 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require 'rubygems'
|
3
|
+
require 'time'
|
4
|
+
require 'date'
|
5
|
+
require 'fileutils'
|
6
|
+
require 'right_aws'
|
7
|
+
require 'rake'
|
8
|
+
require 'yaml'
|
9
|
+
require 'logbox'
|
10
|
+
|
11
|
+
module ObservationCompiler
|
12
|
+
|
13
|
+
# Fetches raw observer logs from S3 and merges them into per-day processed
# log files: normalized, sorted, de-duplicated and gzipped.
class Job

  DEFAULT_KEY_FILE = '/etc/s3_key.yml'

  # Options (all optional):
  #   :raw_logs_bucket     - S3 bucket holding the raw logs
  #   :raw_logs_prefix     - key prefix of raw logs; the date is appended
  #   :processed_logs_path - directory holding the processed .gz files
  #   :working_path        - scratch directory; defaults to a per-process
  #                          folder under the temp dir
  # The options hash is only read, never modified.
  def initialize(options = {})
    @raw_logs_bucket = options[:raw_logs_bucket] || "rwdata-logs"
    @raw_logs_prefix = options[:raw_logs_prefix] || "observer-log-"
    @processed_logs_path = options[:processed_logs_path] || "local_files"
    temp_dir = File.exist?("/apps/smartass") ? "/apps/smartass/tmp/" : "/tmp/"
    @working_path = options[:working_path] || "#{temp_dir}observation_compiler/#{Process.pid}"
  end

  # Downloads the raw logs for every date in raw_date_range and merges them
  # into the processed logs. Whatever has been merged is sorted, zipped and
  # moved back even when a step fails (see the ensure clause); the working
  # folder is removed afterwards.
  def fetch_and_merge(raw_date_range)
    # A raw log-file for a date may contain observations from the day before.
    processed_date_range = (raw_date_range.first-1)..(raw_date_range.last)

    create_working_folders
    copy_processed_logs_to_working_folder(processed_date_range)
    unzip_processed(processed_date_range)

    raw_date_range.each do |date|
      download_raw_logs_to_working_folder(date)
      unzip_raw(date)
      merge_raw_into_processed(date)
      remove_raw_logs(date)
    end

  ensure
    sort_processed(processed_date_range)
    zip_processed(processed_date_range)
    move_processed_back(processed_date_range)
    remove_working_path
  end

  def create_working_folders
    FileUtils.mkdir_p(raw_working_path)
    FileUtils.mkdir_p(processed_working_path)
  end

  # Scratch folder for raw logs.
  def raw_working_path
    File.join(@working_path, "r")
  end

  # Scratch folder for processed logs.
  def processed_working_path
    File.join(@working_path, "p")
  end

  # File name (without .gz) of the processed log for the date.
  def processed_log_name(date)
    "observer-log-#{date.strftime('%Y-%m-%d')}"
  end

  # S3 key prefix / file name prefix of the raw logs for the date.
  def raw_logs_prefix(date)
    @raw_logs_prefix + date.strftime('%Y-%m-%d')
  end

  # Sorted paths of the raw logs for the date present in the working folder.
  def raw_log_paths(date)
    Dir.glob(File.join(raw_working_path, "#{raw_logs_prefix(date)}*")).sort
  end

  def remove_working_path
    FileUtils.rm_r @working_path
  end

  # Copies the existing processed .gz files for the dates into the working folder.
  def copy_processed_logs_to_working_folder(date_range)
    date_range.each do |date|
      name = processed_log_name(date) + ".gz"
      source = File.join(@processed_logs_path, name)
      next unless File.exist?(source)

      log "Copying #{name} to working folder"
      FileUtils.copy(source, File.join(processed_working_path, name))
    end
  end

  # Downloads every raw log whose key starts with the prefix for the date.
  # Raises RuntimeError when the bucket lookup returns nil.
  def download_raw_logs_to_working_folder(date)
    s3 = RightAws::S3.new(*aws_keys)
    bucket = s3.bucket(@raw_logs_bucket)
    raise "Unknown bucket: #{@raw_logs_bucket}" if bucket.nil?

    bucket.keys(:prefix => raw_logs_prefix(date)).each do |raw_log|
      log "Getting #{raw_log.name}"
      # Binary mode: the chunks are written exactly as received (the raw
      # logs may be gzipped, see unzip_raw).
      File.open(File.join(raw_working_path, raw_log.name), "wb") do |file|
        s3.interface.get(@raw_logs_bucket, raw_log.name) do |chunk|
          file.write(chunk)
        end
      end
    end
  end

  # Gunzips the raw logs for the date that end in ".gz". The exit status of
  # gunzip is not checked.
  def unzip_raw(date)
    log "Unzipping raw logs for #{date}"
    raw_log_paths(date).each do |raw_log|
      # Argument-list form: no shell, so spaces or metacharacters in the
      # path (which derives from S3 key names) are harmless.
      system("gunzip", raw_log) if raw_log.end_with?(".gz")
    end
  end

  # Appends every valid, normalized line of the raw logs for the date to the
  # processed log of the day the line belongs to. Raw logs that look
  # already processed are skipped.
  def merge_raw_into_processed(date)
    start_time = Time.now
    count = 0
    out_files = {}
    raw_log_paths(date).each do |raw_log|
      if raw_log_already_processed?(raw_log)
        log "Skipping #{raw_log}"
        next
      end

      log "Processing #{raw_log}"
      File.foreach raw_log do |line|
        log_line = LogLine.new(line)
        next unless log_line.valid?

        # The line's own date decides the target file; it may differ from
        # the date of the raw log.
        name = File.join(processed_working_path, processed_log_name(log_line.date))
        out_files[name] ||= File.open(name, "a")
        out_files[name] << log_line.normalize
        count += 1
      end
    end
  ensure
    out_files.each_value { |file| file.close }
    elapsed = (Time.now - start_time).to_f
    log "#{count} rader på #{elapsed}s (#{count/elapsed} rader/s)"
  end

  # Look for the last observation of the raw log in the processed log of the
  # same day to see if the raw log is already processed.
  def raw_log_already_processed?(log_file_name)
    last_observation = IO.popen(["tail", "-n", "1", log_file_name]) { |io| io.read }
    log_line = LogLine.new(last_observation)
    return false unless log_line.valid?

    processed_file_name = File.join(processed_working_path, processed_log_name(log_line.date))
    # Chomp the pattern: grep reads a newline in the pattern as a pattern
    # separator.
    # NOTE(review): this searches for the raw line while the processed file
    # holds normalized lines - confirm that this is intended.
    File.exist?(processed_file_name) &&
      system("grep", "-qF", "-e", last_observation.chomp, processed_file_name)
  end

  def remove_raw_logs(date)
    log "Removing raw logs for #{date}"
    raw_log_paths(date).each do |raw_log|
      FileUtils.rm(raw_log)
    end
  end

  # Sorts each existing processed log on fields 2-4 (":" separated) and
  # drops adjacent duplicate lines. Raises RuntimeError when the sort
  # pipeline fails.
  def sort_processed(date_range)
    date_range.each do |date|
      name = processed_log_name(date)
      Dir.chdir processed_working_path do
        next unless File.exist?(name)

        log "Sorting #{name}"
        # LC_ALL=C is set for the child only, not for this process. The
        # name is built from a formatted date, so it is safe to interpolate
        # into the shell pipeline.
        ok = system({'LC_ALL' => 'C'}, "sort -t: -k2,4 #{name} | uniq > #{name}.sorted")
        raise "Sort error!" unless ok
        File.rename("#{name}.sorted", name)
      end
    end
  end

  # Gzips each existing processed log in the working folder.
  def zip_processed(date_range)
    log "Zipping processed files"
    date_range.each do |date|
      file = File.join(processed_working_path, processed_log_name(date))
      system("gzip", file) if File.exist?(file)
    end
  end

  # Gunzips each existing processed .gz file in the working folder.
  def unzip_processed(date_range)
    log "Unzipping processed files"
    date_range.each do |date|
      file = File.join(processed_working_path, processed_log_name(date) + ".gz")
      system("gunzip", file) if File.exist?(file)
    end
  end

  # Moves the processed .gz files from the working folder back to the
  # processed logs path, replacing the previous versions.
  def move_processed_back(date_range)
    date_range.each do |date|
      name = processed_log_name(date) + ".gz"
      source = File.join(processed_working_path, name)
      next unless File.exist?(source)

      log "Moving #{name} back"
      FileUtils.move(source, File.join(@processed_logs_path, name))
    end
  end

  # Prints the message unless the TEST_RUN constant is defined.
  def log msg
    puts msg unless defined?(TEST_RUN)
  end

  # Returns [access_key_id, secret_access_key]. Read from DEFAULT_KEY_FILE
  # when it exists; otherwise the id comes from ENV['OBSENTER_S3_KEY'] and
  # the secret from the keychain.
  def aws_keys
    if File.exist? DEFAULT_KEY_FILE
      hash = load_key_file(DEFAULT_KEY_FILE)
      [hash[:access_key_id], hash[:secret_access_key]]
    else
      access_key_id = ENV['OBSENTER_S3_KEY']
      secret_access_key = secret_access_key_from_keychain!(access_key_id)
      [access_key_id, secret_access_key]
    end
  end

  # These two methods are borrowed from Awsborn

  # Like secret_access_key_from_keychain, but raises RuntimeError when no
  # secret is found.
  def secret_access_key_from_keychain! (key_id)
    secret = secret_access_key_from_keychain key_id
    raise "Could not find secret access key for #{key_id}" if secret.to_s == ''
    secret
  end

  # Looks up the secret for key_id with the `security` command line tool and
  # memoizes it. Returns nil when no password is found or the tool cannot be
  # run.
  def secret_access_key_from_keychain (key_id)
    @credentials ||= {}
    unless @credentials[key_id]
      command = ["security", "-q", "find-generic-password", "-a", key_id.to_s, "-g"]
      dump = begin
        # stderr is merged into stdout, as "2>&1" did in the shell version.
        IO.popen(command, "r", :err => [:child, :out]) { |io| io.read }
      rescue SystemCallError
        ""
      end
      @credentials[key_id] = dump[/password: "(.*)"/, 1]
    end
    @credentials[key_id]
  end

  private

  # Loads the key file. aws_keys reads it with symbol keys, which the safe
  # loader behind YAML.load_file rejects from Psych 4 on, so the unsafe
  # loader is used where it exists. The file is a local, operator-owned
  # configuration file; do not reuse this for untrusted input.
  def load_key_file(path)
    if YAML.respond_to?(:unsafe_load_file)
      YAML.unsafe_load_file(path)
    else
      YAML.load_file(path)
    end
  end

end
|
231
|
+
|
232
|
+
# One raw log line, with helpers to validate it and to normalize it to the
# Apache-style format with two trailing "-" cookie fields and a +0000
# timestamp.
class LogLine

  # Captures day, month name, year, hour, minute, second, the signed hour
  # part of the zone offset and (optionally, group 8) its minute part.
  TIMESTAMP_MATCHER = /(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+)\s([-+]?\d{2})(\d{2})?/

  # The line is passed through Logbox::StringEncoder.iconv before use.
  def initialize(line)
    @line = Logbox::StringEncoder.iconv(line)
  end

  # True when the line can be normalized without an error.
  def valid?
    normalize
    true
  rescue
    false
  end

  # Rewrites the line in place to the normalized format and returns it.
  # Raises a StandardError when the line has no bracketed timestamp.
  def normalize
    normalize_s3_format
    normalize_apache_format
    normalize_timestamp
    @line
  end

  # The timestamp of the line as a UTC Time (memoized). Raises ArgumentError
  # when the line contains no timestamp.
  def timestamp
    unless @timestamp
      match = @line.match(TIMESTAMP_MATCHER)
      raise ArgumentError, "No timestamp in log line" unless match

      local = Time.utc(match[3], match[2], match[1], match[4], match[5], match[6])
      # Correct the zone, including the minutes of offsets such as +0530.
      # The sign is read from the text because "-00".to_i loses it.
      sign = match[7].start_with?('-') ? -1 : 1
      offset = match[7].to_i * 3600 + sign * match[8].to_i * 60
      @timestamp = local - offset
    end
    @timestamp
  end

  # The UTC calendar date of the timestamp.
  def date
    utc = timestamp
    Date.civil(utc.year, utc.month, utc.day)
  end

  def to_s
    @line
  end

  private

  # Regexp source fragments: a run of non-space characters, ...
  S = '(\\S+)'
  # ... a double-quoted field. Equivalent to /("(?:\\\\|\\"|[^\\"])*")/
  Q = '("(?:\\\\\\\\|\\\\"|[^\\\\"])*")'
  # ... and a bracketed timestamp.
  TIMESTAMP = '(\\[[^\\]]+\\])'

  # A string as options argument means IGNORECASE before Ruby 3.2 and is
  # parsed as flags from 3.2 on; the constant is unambiguous. The pattern
  # contains neither "." nor letters, so either flag matches the same lines.
  S3_FORMAT = Regexp.new('^' + [S,S,TIMESTAMP,S,S,S,S,S,Q,S,S,S,S,S,S,Q,Q,S].join(' '), Regexp::MULTILINE)

  # Rewrites a line in S3 access log format to the Apache-style format.
  # Other lines are left untouched. Example input:
  # %Q{owner bucket [16/Mar/2010:16:00:00 +0000] 85.225.221.221 requester requestID operation key "GET /log.gif?o=view_tag& HTTP/1.1" 200 - 35 35 6 5 "http://www.24.se/" "Mozilla/5.0" -}
  def normalize_s3_format
    match = @line.match(S3_FORMAT)
    if match
      # Fields 4, 3, 9, 10, 12, 16 and 17 of the S3 line, in this order.
      @line = %Q(#{match[4]} - - #{match[3]} #{match[9]} #{match[10]} #{match[12]} #{match[16]} #{match[17]} "-" "-"\n)
    end
    @line
  end

  APACHE_WITHOUT_COOKIES = Regexp.new('^' + [S,S,S,TIMESTAMP,Q,S,S,Q,Q].join(' ') + '$')

  # Add third party cookies at end if they are not there, e.g. for
  # %Q{124.191.88.9 - - [26/May/2009:23:59:50 +0000] "GET /log.gif" 200 35 "-" "Mozilla/5.0"}
  def normalize_apache_format
    match = @line.match(APACHE_WITHOUT_COOKIES)
    if match
      @line = %Q(#{match[0]} "-" "-"\n)
    end
    @line
  end

  # Rewrites the timestamp to UTC unless it already ends in "0000":
  # 12/Apr/2010:09:07:23 +0200 => 12/Apr/2010:07:07:23 +0000
  def normalize_timestamp
    # A line without a bracketed timestamp yields nil here and fails on the
    # next line; valid? relies on that to reject the line.
    match = @line.match(/^(.*?\[)([^\]]+)(\].+)$/m)
    unless match[2].end_with?('0000')
      @line = "#{match[1]}#{timestamp.strftime('%d/%b/%Y:%H:%M:%S +0000')}#{match[3]}"
    end
    @line
  end

end
|
309
|
+
|
310
|
+
end
|
311
|
+
|