logbox 0.2.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.bundle/config +3 -0
- data/.rvmrc +2 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +30 -0
- data/README +14 -0
- data/Rakefile +74 -0
- data/VERSION +1 -0
- data/bin/download_logs +20 -0
- data/bin/obsstats +39 -0
- data/bin/rotate +17 -0
- data/bin/viewobs +198 -0
- data/lib/logbox.rb +9 -0
- data/lib/logbox/ansi_colors.rb +28 -0
- data/lib/logbox/log_parser.rb +79 -0
- data/lib/logbox/mockup_log.rb +44 -0
- data/lib/logbox/observation.rb +162 -0
- data/lib/logbox/observation_compiler.rb +311 -0
- data/lib/logbox/observation_mover.rb +142 -0
- data/lib/logbox/stream_wrapper.rb +20 -0
- data/lib/logbox/stream_wrapper/gzip_multi_file.rb +90 -0
- data/lib/logbox/stream_wrapper/observation_filter.rb +113 -0
- data/lib/logbox/stream_wrapper/order_blob_splitter.rb +96 -0
- data/lib/setup_environment.rb +15 -0
- data/logbox.gemspec +110 -0
- data/test/bin_viewobs_test.rb +42 -0
- data/test/fixtures/aws_keys_yaml.txt +3 -0
- data/test/fixtures/double-obs.log +1 -0
- data/test/fixtures/error_line.log +1 -0
- data/test/fixtures/log-for-md5.log +1 -0
- data/test/fixtures/log0.log +0 -0
- data/test/fixtures/log1.log +1 -0
- data/test/fixtures/log1.log.gz +0 -0
- data/test/fixtures/log2.log +2 -0
- data/test/fixtures/log2.log.gz +0 -0
- data/test/fixtures/log_invalid_mixed_encoding.log +1 -0
- data/test/fixtures/observation_filter.log +5 -0
- data/test/fixtures/unquoted_ugliness.log +2 -0
- data/test/log_parser_test.rb +84 -0
- data/test/observation_compiler_test.rb +216 -0
- data/test/observation_mover_test.rb +135 -0
- data/test/observation_test.rb +114 -0
- data/test/stream_wrapper/gzip_multi_file_test.rb +147 -0
- data/test/stream_wrapper/observation_filter_test.rb +171 -0
- data/test/stream_wrapper/order_blob_splitter_test.rb +129 -0
- data/test/test_helper.rb +23 -0
- metadata +177 -0
data/lib/logbox/log_parser.rb
@@ -0,0 +1,79 @@
+require 'date'
+require 'cgi'
+require 'stringio'
+
+# Parses a standard web server log file stream and returns a hash with
+# key/values for each line. Includes the Enumerable interface.
+class LogParser
+  include Enumerable
+
+  # Support both strings and streams as input.
+  def initialize(input)
+    input = StringIO.new(input) if input.class == String
+    @stream = input
+  end
+
+  # Enumerable interface.
+  def each
+    while(observation = get_next_observation)
+      yield observation
+    end
+  end
+
+  def get_next_observation
+    line = @stream.gets
+    line && LogParser.parse_line(line)
+  end
+
+  LOG_FORMAT = /([^ ]*) [^ ]* [^ ]* \[([^\]]*)\] "([^"]*)" ([^ ]*)/
+  LOG_DATE_FORMAT = "%d/%b/%Y:%H:%M:%S %z"
+  LOG_KEY_VALUE_FORMAT = /[?&]([^=]+)=([^&]+)/
+  SERVER_ATTRIBUTES = [:ip, :timestamp, :request, :status]
+
+  # Parse one log line and return a hash with all attributes.
+  def self.parse_line(line)
+    return nil if line.strip.empty?
+
+    line =~ LOG_FORMAT
+    result = {}
+
+    # Save ip, timestamp and request.
+    result[:ip] = $1
+    begin
+      result[:timestamp] = DateTime.strptime($2, LOG_DATE_FORMAT)
+    rescue ArgumentError
+      raise ParseError.new("Error while parsing timestamp")
+    end
+    result[:request] = $3
+    result[:status] = $4
+
+    # Extract key/value pairs from the query part of the request.
+    $3.scan(LOG_KEY_VALUE_FORMAT) do |key, value|
+      begin
+        key = CGI.unescape(key).to_sym
+        value = CGI.unescape(value)
+      rescue Encoding::CompatibilityError => e
+        raise ParseError.new("Error while parsing query parameters")
+      end
+
+      if result.has_key? key
+        if result[key].is_a? Array
+          result[key] << value
+        else
+          result[key] = [result[key], value]
+        end
+      else
+        result[key] = value
+      end
+    end
+
+    return result
+  rescue ParseError
+    raise
+  rescue
+    raise ParseError.new("Unknown parsing error")
+  end
+  class ParseError < StandardError ; end
+end
+
+
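A minimal usage sketch of the parser above (the log line, file name, and require path are illustrative, not from the gem):

    require 'logbox/log_parser'

    line = '1.2.3.4 - - [02/Mar/2009:06:58:12 +0100] "GET /log.gif?o=visit_page&sid=demo& HTTP/1.1" 200'
    obs = LogParser.parse_line(line)
    obs[:ip]      # => "1.2.3.4"
    obs[:status]  # => "200"
    obs[:o]       # => "visit_page" (query parameters become symbol keys)

    # The instance form wraps a string or an IO stream and is Enumerable.
    LogParser.new(File.open("access.log")).each do |observation|
      # observation is the same hash parse_line returns; malformed lines
      # raise LogParser::ParseError, and a blank line returns nil, which
      # stops the while loop in #each.
    end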
data/lib/logbox/mockup_log.rb
@@ -0,0 +1,44 @@
+require 'time'
+require 'uri'
+
+module MockupLog
+
+  # Create a custom log file string.
+  def mockup_log lines_data
+    template_lines = {
+      :visit_page => %Q'72.211.248.18 - - [02/Mar/2009:06:58:12 +0100] "GET /log.gif?a=Mozilla%2F5.0%20(Macintosh%3B%20U%3B%20Intel%20Mac%20OS%20X%2010_5_5%3B%20en-us)%20AppleWebKit%2F525.27.1%20(KHTML%2C%20like%20Gecko)%20Version%2F3.2.1%20Safari%2F525.27.1&aid=xroads&l=en-us&n=netscape&o=visit_page&p=macintel&r=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProductGroup.asp%3Fclick%3D1%26pageIndex%3D14%26type%3Dgeneric&s=1920x1200&sid=DHS_Shop&t=Product&u=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProduct.asp%3FitemId%3D6005&uid=1235973461091911420&x=35412&z=480& HTTP/1.1" 200 35 "http://designhousestockholm.xroads.se/Portal/Products/Product.asp?itemId=6005" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1" "-" "-"\n',
+      :view_item => %Q'72.211.248.18 - - [02/Mar/2009:06:59:32 +0100] "GET /log.gif?_current_price=US%2471&_image=..%2F..%2FArchive%2FImages%2FWebshop%2FProducts%2FDesignhouseStockholm%2F1132-1000_stor.jpg&_item_id=6005&_sku=HHAW09%20Shelter%20Pants&_thumbnail=..%2F..%2FArchive%2FImages%2FWebshop%2FProducts%2FDesignhouseStockholm%2F1132-1000_liten.jpg&_title=%C2%A0PLEECE%20-%20HOOD&_url=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProduct.asp%3FitemId%3D6005&a=Mozilla%2F5.0%20(Macintosh%3B%20U%3B%20Intel%20Mac%20OS%20X%2010_5_5%3B%20en-us)%20AppleWebKit%2F525.27.1%20(KHTML%2C%20like%20Gecko)%20Version%2F3.2.1%20Safari%2F525.27.1&aid=xroads&l=en-us&n=netscape&o=view_item&p=macintel&r=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProductGroup.asp%3Fclick%3D1%26pageIndex%3D14%26type%3Dgeneric&s=1920x1200&sid=DHS_Shop&t=Product&u=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProduct.asp%3FitemId%3D6005&uid=1235973461091911420&x=35412&z=480& HTTP/1.1" 200 35 "http://designhousestockholm.xroads.se/Portal/Products/Product.asp?itemId=6005" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1" "-" "-"\n',
+      :pick_item => %Q'72.211.248.18 - - [02/Mar/2009:07:00:28 +0100] "GET /log.gif?_item_id=6004&a=Mozilla%2F5.0%20(Macintosh%3B%20U%3B%20Intel%20Mac%20OS%20X%2010_5_5%3B%20en-us)%20AppleWebKit%2F525.27.1%20(KHTML%2C%20like%20Gecko)%20Version%2F3.2.1%20Safari%2F525.27.1&aid=xroads&e=click&l=en-us&n=netscape&o=pick_item&p=macintel&r=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProductGroup.asp%3Fclick%3D1%26pageIndex%3D14%26type%3Dgeneric&s=1920x1200&sid=DHS_Shop&t=Product&u=http%3A%2F%2Fdesignhousestockholm.xroads.se%2FPortal%2FProducts%2FProduct.asp%3FitemId%3D6004&uid=1235973461091911420&x=98033&z=480& HTTP/1.1" 200 35 "http://designhousestockholm.xroads.se/Portal/Products/Product.asp?itemId=6004" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1" "-" "-"\n',
+      :buy_item => %Q'72.211.248.18 - - [02/Mar/2009:07:01:28 +0100] "GET /log.gif?_current_price=1250&_order_id=13945&_quantity=1&_sku=FERTILITETSMONITOR&_title=Fertilitetsmonitor+fr%C3%A5n+Clearblue&a=Mozilla%2F4.0%20(compatible%3B%20MSIE%207.0%3B%20Windows%20NT%205.1%3B%20.NET%20CLR%201.1.4322%3B%20.NET%20CLR%202.0.50727%3B%20.NET%20CLR%203.0.4506.2152%3B%20.NET%20CLR%203.5.30729)&aid=jetshop&l=sv&n=microsoft%20internet%20explorer&o=buy_item&p=win32&s=1280x960&sid=www.medistore.se&t=H%C3%A4r%20hittar%20du%20sjukv%C3%A5rdsprodukter%2C%20blodtrycksm%C3%A4tare%2C%20f%C3%B6rsta%20hj%C3%A4lpen%20och%20pulsklockor&u=https%3A%2F%2Fwww.medistore.se%2FOrderDetailsConfirmed.aspx&uid=1268173983791130948&x=68410&z=-60& HTTP/1.1" 200 35 "http://designhousestockholm.xroads.se/Portal/Products/Product.asp?itemId=6004" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1" "-" "-"\n',
+      :heed_recommendation => %Q'72.211.248.18 - - [02/Mar/2009:07:01:28 +0100] "GET /log.gif?_current_price=189%2C00%20kr&_item_id=1441&_rec_for=2407&_rec_type=viewed_this_also_viewed&_recommended_ids%5B0%5D=1441&_recommended_ids%5B1%5D=960&_recommended_ids%5B2%5D=535&_url=http%3A%2F%2Fwww.shirtstore.se%2Fits-on-like-donkey-kong-p-1441.aspx&a=Mozilla%2F4.0%20(compatible%3B%20MSIE%208.0%3B%20Windows%20NT%206.1%3B%20Trident%2F4.0%3B%20GTB6.4%3B%20SLCC2%3B%20.NET%20CLR%202.0.50727%3B%20.NET%20CLR%203.5.30729%3B%20.NET%20CLR%203.0.30729%3B%20Media%20Center%20PC%206.0)&aid=jetshop&e=click&l=sv&n=microsoft%20internet%20explorer&o=heed_recommendation&p=win32&r=http%3A%2F%2Fwww.shirtstore.se%2Ftshirts-c-491-1.aspx&s=1280x800&sid=www.shirtstore.se&t=The%20Hulk%20Distressed&u=http%3A%2F%2Fwww.shirtstore.se%2Fthe-hulk-distressed-p-2407-c-491.aspx&uid=1266087502030697666&x=55724&z=-60& HTTP/1.1" 200 35 "http://designhousestockholm.xroads.se/Portal/Products/Product.asp?itemId=6004" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1" "-" "-"\n',
+    }
+    log = ""
+    lines_data.each do |line_data|
+      line_data = {:aid => "xroads", :sid => "Fortum", :uid => "12345", :timestamp => Time.parse("2009-03-02 07:01:28")}.merge(line_data)
+      observation_type = line_data.delete(:o) || raise("Missing observation type attribute (:o => 'xx') in line data.")
+      raise "Has no template for this observation type" unless template_lines[observation_type.to_sym]
+      # Construct the log line.
+      line = template_lines[observation_type.to_sym].dup
+      line_data.each do |key, value|
+        case key
+        when :timestamp then
+          line.sub!(/\[[^\]]+?\]/, value.nil? ? '' : value.strftime('[%d/%b/%Y:%H:%M:%S +0000]'))
+        when :ip then
+          line.sub!(/^\d+\.\d+\.\d+\.\d+/, value)
+        else
+          value = URI.escape(value, /[^-_.!~*'()a-zA-Z\d]/n)
+          line.gsub!(Regexp.new("#{key.to_s}=[^&]+&", true), "#{key.to_s}=#{value}&")
+        end
+      end
+      log << line
+    end
+    log
+  end
+
+  def mockup_log_file(path, lines_data)
+    File.open(path, "w") do |file|
+      file.write(mockup_log(lines_data))
+    end
+  end
+
+end
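mockup_log is an instance method, so a test would mix the module in; a sketch (the class name and attribute overrides below are made up):

    require 'time'

    class FixtureBuilder
      include MockupLog
    end

    log = FixtureBuilder.new.mockup_log([
      { :o => "visit_page" },  # merged defaults: aid "xroads", sid "Fortum", uid "12345"
      { :o => "view_item", :_item_id => "4711",
        :timestamp => Time.parse("2009-03-02 07:00:00") }
    ])
    # => a two-line Apache-style log string; each override is URI-escaped and
    #    substituted into the matching key=value& pair of the template line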
data/lib/logbox/observation.rb
@@ -0,0 +1,162 @@
+require 'uri'
+require 'log_parser'
+
+class Observation
+
+  BASIC_ATTRIBUTES_NAMES = {
+    :aid => :account_id,
+    :sid => :shop_id,
+    :uid => :user_id,
+    :u => :document_url,
+    :r => :referrer,
+    :t => :document_title,
+    :s => :screen_resolution,
+    :l => :browser_language,
+    :p => :platform,
+    :a => :user_agent,
+    :h => :history_count,
+    :n => :navigator_name,
+    :z => :time_zone_offset,
+    :x => :seed,
+    :o => :observation_type
+  }
+
+  OBSERVATION_ATTRIBUTES_NAMES = {
+    :_item_id => :item_id,
+    :_rec_for => :rec_for,
+    :_title => :title,
+    :_url => :url,
+    :_basket_url => :basket_url,
+    :_description => :description,
+    :_normal_price => :normal_price,
+    :_current_price => :current_price,
+    :_thumbnail => :thumbnail,
+    :_image => :image,
+    :_stock => :stock,
+    :_expires => :expires,
+    :_order_blob => :order_blob,
+    :_order_id => :order_id,
+    :_sku => :sku,
+    :_quantity => :quantity,
+    :_tags => :tags,
+    :_user1 => :user1,
+    :_user2 => :user2,
+    :_user3 => :user3,
+    :_user4 => :user4,
+    :_user5 => :user5,
+    :_culture => :culture,
+    :_currency => :currency,
+    :_group_ref => :group_ref,
+    :_with_vat => :with_vat
+  }
+  BASIC_ATTRIBUTES = BASIC_ATTRIBUTES_NAMES.values
+  OBSERVATION_ATTRIBUTES = OBSERVATION_ATTRIBUTES_NAMES.values
+  ITEM_ATTRIBUTES = OBSERVATION_ATTRIBUTES
+  OBSERVATION_TYPE_STRINGS = %w[visit_page view_item pick_item buy_basket buy_item view_recommendation heed_recommendation]
+  OBSERVATION_TYPES = OBSERVATION_TYPE_STRINGS.map { |e| e.to_sym }
+  # Converted with one big perl expression:
+  # perl -pi -e 's/([&?])_id=/\1_item_id=/g; s/currentprice/current_price/g; s/normalprice/normal_price/g; s/basketurl/basket_url/g; s/([&?])_type=/\1o=/g; s/buyitem/pick_item/g; s/viewitem/view_item/g; s/searchclick/search_pick_item/g; s/sid=bwintest/aid=bwin/g; s/sid=/aid=crossroads&sid=/g; s/aid=bwin/aid=bwin&sid=bwintest/g;' observer.access.log.*
+
+  attr_reader :attributes, :unknown_attributes, :type, :errors
+
+  def initialize(logline_attributes)
+    transfer_attributes(logline_attributes.clone)
+    validate_type
+    ensure_correct_urls
+  end
+
+  def valid?
+    @errors.empty?
+  end
+
+  def [](key)
+    @attributes[key]
+  end
+
+  def view_item?
+    @type == :view_item
+  end
+
+  def pick_item?
+    @type == :pick_item
+  end
+
+  def buy_item?
+    @type == :buy_item
+  end
+
+  private
+
+  # Takes the input hash and moves all known attributes to the @attributes
+  # hash, renaming some of them. Unknown attributes are also saved.
+  def transfer_attributes(logline_attributes)
+    @attributes = {}
+
+    # Transfer basic attributes.
+    (BASIC_ATTRIBUTES_NAMES.to_a + OBSERVATION_ATTRIBUTES_NAMES.to_a).each do |key, new_key|
+      value = logline_attributes.delete(key)
+      @attributes[new_key] = value if value
+    end
+
+    # Transfer server attributes.
+    LogParser::SERVER_ATTRIBUTES.each do |key|
+      value = logline_attributes.delete(key)
+      @attributes[key] = value if value
+    end
+
+    @unknown_attributes = logline_attributes
+  end
+
+  # Ensure that URLs are absolute and don't contain the anchor part.
+  def ensure_correct_urls
+    if @attributes.has_key? :document_url
+      @errors ||= []
+      [:url, :thumbnail, :image, :basket_url].each do |key|
+        begin
+          if @attributes.has_key? key
+            # Ensure that the url is not encoded (it is ok to unencode a url that is not encoded).
+            url = URI.unescape(@attributes[key])
+            # But we store and handle all urls encoded (according to the RFC). URI.join also requires it.
+            url = URI.escape(url)
+            base_url = URI.escape(@attributes[:document_url])
+            # Ensure absolute.
+            full_url = URI.join(base_url, url).to_s
+            # Ensure no debug anchor. Just to make sure that #debug urls do not reach Dumbo.
+            full_url.gsub!(/%23debug(on|off)?/, '')
+            @attributes[key] = full_url
+          end
+        rescue
+          @errors << "url_error_on_#{key}".to_sym
+        end
+
+      end
+    end
+  end
+
+  REQUIRED_ATTRIBUTES = [:user_id, :account_id, :shop_id, :item_id]
+  REQUIRED_ATTRIBUTES_FOR_TYPE = {
+    :buy_basket => [:user_id, :account_id, :shop_id],
+    :buy_item => [:user_id, :account_id, :shop_id, :sku],
+    :visit_page => [:user_id, :account_id, :shop_id],
+    :view_recommendation => [:user_id, :account_id, :shop_id, :rec_for],
+    :heed_recommendation => [:user_id, :account_id, :shop_id, :item_id]
+  }
+
+  def validate_type
+    @type = :unknown
+    @errors ||= []
+
+    # Only handle successful requests.
+    @errors << :status_not_ok and return unless @attributes[:status] == "200"
+
+    # Only handle types we know of.
+    @errors << :unknown_type and return unless OBSERVATION_TYPE_STRINGS.include? @attributes[:observation_type]
+    @type = @attributes[:observation_type].to_sym
+
+    # For all valid types we require some attributes.
+    (REQUIRED_ATTRIBUTES_FOR_TYPE[@type] || REQUIRED_ATTRIBUTES).each do |a|
+      @errors << "missing_#{a}".to_sym unless @attributes[a] =~ /.+/
+    end
+  end
+
+end
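Observation consumes the hashes LogParser produces; a sketch of the intended flow (the file name is illustrative):

    LogParser.new(File.open("observer-log-2009-03-02")).each do |logline_attributes|
      observation = Observation.new(logline_attributes)
      next unless observation.valid?  # errors collect as e.g. :status_not_ok, :missing_user_id

      observation.type                # => :view_item
      observation[:item_id]           # raw :_item_id, renamed by transfer_attributes
      observation[:url]               # resolved against :document_url by ensure_correct_urls
      observation.unknown_attributes  # anything not in the two name maps
    end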
data/lib/logbox/observation_compiler.rb
@@ -0,0 +1,311 @@
+# -*- encoding: utf-8 -*-
+require 'rubygems'
+require 'time'
+require 'date'
+require 'fileutils'
+require 'right_aws'
+require 'rake'
+require 'yaml'
+require 'logbox'
+
+module ObservationCompiler
+
+  class Job
+
+    def initialize(options = {})
+      @raw_logs_bucket = options[:raw_logs_bucket] || "rwdata-logs"
+      @raw_logs_prefix = options[:raw_logs_prefix] || "observer-log-"
+      @processed_logs_path = options[:processed_logs_path] || "local_files"
+      temp_dir = File.exist?("/apps/smartass") ? "/apps/smartass/tmp/" : "/tmp/"
+      @working_path = options[:working_path] ||= "#{temp_dir}observation_compiler/#{Process.pid}"
+    end
+
+    def fetch_and_merge(raw_date_range)
+      # A raw log-file for a date may contain observations from the day before.
+      processed_date_range = (raw_date_range.first-1)..(raw_date_range.last)
+
+      create_working_folders
+      copy_processed_logs_to_working_folder(processed_date_range)
+      unzip_processed(processed_date_range)
+
+      raw_date_range.each do |date|
+        download_raw_logs_to_working_folder(date)
+        unzip_raw(date)
+        merge_raw_into_processed(date)
+        remove_raw_logs(date)
+      end
+
+    ensure
+      sort_processed(processed_date_range)
+      zip_processed(processed_date_range)
+      move_processed_back(processed_date_range)
+      remove_working_path
+    end
+
+    def create_working_folders
+      FileUtils.mkdir_p(raw_working_path)
+      FileUtils.mkdir_p(processed_working_path)
+    end
+
+    def raw_working_path
+      File.join(@working_path, "r")
+    end
+
+    def processed_working_path
+      File.join(@working_path, "p")
+    end
+
+    def processed_log_name(date)
+      "observer-log-#{date.strftime('%Y-%m-%d')}"
+    end
+
+    def raw_logs_prefix(date)
+      @raw_logs_prefix + date.strftime('%Y-%m-%d')
+    end
+
+    def raw_log_paths(date)
+      Dir.glob(File.join(raw_working_path, "#{raw_logs_prefix(date)}*")).sort
+    end
+
+    def remove_working_path
+      FileUtils.rm_r @working_path
+    end
+
+    def copy_processed_logs_to_working_folder(date_range)
+      date_range.each do |date|
+        name = processed_log_name(date) + ".gz"
+        source = File.join(@processed_logs_path, name)
+        destination = File.join(processed_working_path, name)
+        if File.exist? source
+          log "Copying #{name} to working folder"
+          FileUtils.copy(source, destination)
+        end
+      end
+    end
+
+    def download_raw_logs_to_working_folder(date)
+      s3 = RightAws::S3.new(*aws_keys)
+      bucket = s3.bucket(@raw_logs_bucket)
+      raise "Unknown bucket: #{@raw_logs_bucket}" if bucket.nil?
+
+      raw_logs = bucket.keys(:prefix => raw_logs_prefix(date))
+      raw_logs.each do |raw_log|
+        log "Getting #{raw_log.name}"
+        File.open(File.join(raw_working_path, raw_log.name), "w") do |file|
+          s3.interface.get(@raw_logs_bucket, raw_log.name) do |chunk|
+            file.write(chunk)
+          end
+        end
+      end
+    end
+
+    def unzip_raw(date)
+      log "Unzipping raw logs for #{date}"
+      raw_log_paths(date).each do |raw_log|
+        system "gunzip #{raw_log}" if raw_log.end_with?(".gz")
+      end
+    end
+
+    def merge_raw_into_processed(date)
+      start_time = Time.now
+      count = 0
+      out_files = {}
+      raw_log_paths(date).each do |raw_log|
+        if raw_log_already_processed?(raw_log)
+          log "Skipping #{raw_log}"
+          next
+        else
+          log "Processing #{raw_log}"
+        end
+        File.foreach raw_log do |line|
+          log_line = LogLine.new(line)
+          next unless log_line.valid?
+          date = log_line.date
+          name = File.join(processed_working_path, processed_log_name(date))
+          out_files[name] ||= File.open(name, "a")
+          out_files[name] << log_line.normalize
+          count += 1
+        end
+      end
+    ensure
+      out_files.each_value { |file| file.close }
+      log "#{count} lines in #{(Time.now - start_time).to_f}s (#{count/(Time.now - start_time).to_f} lines/s)"
+    end
+
+    def raw_log_already_processed?(log_file_name)
+      # Look for the last observation to see if it is already processed.
+      last_observation = `tail -n 1 #{log_file_name}`
+      log_line = LogLine.new(last_observation)
+      return false unless log_line.valid?
+      date = log_line.date
+      processed_file_name = File.join(processed_working_path, processed_log_name(date))
+      File.exist?(processed_file_name) && system("grep", "-qF", last_observation.chomp, processed_file_name)
+    end
+
+    def remove_raw_logs(date)
+      log "Removing raw logs for #{date}"
+      raw_log_paths(date).each do |raw_log|
+        FileUtils.rm(raw_log)
+      end
+    end
+
+    def sort_processed(date_range)
+      date_range.each do |date|
+        name = processed_log_name(date)
+        Dir.chdir processed_working_path do
+          next unless File.exist?(name)
+          log "Sorting #{name}"
+          ENV['LC_ALL'] = 'C'
+          ok = system "sort -t: -k2,4 #{name} | uniq > #{name}.sorted"
+          raise "Sort error!" unless ok
+          File.rename("#{name}.sorted", name)
+        end
+      end
+    end
+
+    def zip_processed(date_range)
+      log "Zipping processed files"
+      date_range.each do |date|
+        name = processed_log_name(date)
+        file = File.join(processed_working_path, name)
+        system "gzip #{file}" if File.exist? file
+      end
+    end
+
+    def unzip_processed(date_range)
+      log "Unzipping processed files"
+      date_range.each do |date|
+        name = processed_log_name(date) + ".gz"
+        file = File.join(processed_working_path, name)
+        system "gunzip #{file}" if File.exist? file
+      end
+    end
+
+    def move_processed_back(date_range)
+      date_range.each do |date|
+        name = processed_log_name(date) + ".gz"
+        source = File.join(processed_working_path, name)
+        destination = File.join(@processed_logs_path, name)
+        if File.exist? source
+          log "Moving #{name} back"
+          FileUtils.move(source, destination)
+        end
+      end
+    end
+
+    def log msg
+      unless defined?(TEST_RUN)
+        puts msg
+      end
+    end
+
+    DEFAULT_KEY_FILE = '/etc/s3_key.yml'
+    def aws_keys
+      if File.exists? DEFAULT_KEY_FILE
+        hash = YAML.load_file(DEFAULT_KEY_FILE)
+        [hash[:access_key_id], hash[:secret_access_key]]
+      else
+        access_key_id = ENV['OBSENTER_S3_KEY']
+        secret_access_key = secret_access_key_from_keychain!(access_key_id)
+        [access_key_id, secret_access_key]
+      end
+    end
+
+    # These two methods are borrowed from Awsborn.
+    def secret_access_key_from_keychain! (key_id)
+      secret = secret_access_key_from_keychain key_id
+      raise "Could not find secret access key for #{key_id}" if secret.to_s == ''
+      secret
+    end
+
+    def secret_access_key_from_keychain (key_id)
+      @credentials ||= {}
+      unless @credentials[key_id]
+        dump = `security -q find-generic-password -a "#{key_id}" -g 2>&1`
+        secret_key = dump[/password: "(.*)"/, 1]
+        @credentials[key_id] = secret_key
+      end
+      @credentials[key_id]
+    end
+  end
+
+  class LogLine
+
+    def initialize(line)
+      @line = Logbox::StringEncoder.iconv(line)
+    end
+
+    def valid?
+      normalize
+      true
+    rescue
+      false
+    end
+
+    def normalize
+      normalize_s3_format
+      normalize_apache_format
+      normalize_timestamp
+      @line
+    end
+
+    TIMESTAMP_MATCHER = /(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+)\s([-+]?\d{2})/
+
+    def timestamp
+      unless @timestamp
+        match = @line.match(TIMESTAMP_MATCHER)
+        @timestamp = Time.utc(match[3], match[2], match[1], match[4], match[5], match[6])
+        @timestamp -= match[7].to_i * 3600 # Correct the zone. Works only for whole-hour timezones.
+      end
+      @timestamp
+    end
+
+    def date
+      timestamp.send :to_date
+    end
+
+    def to_s
+      @line
+    end
+
+    private
+
+    S = '(\\S+)'
+    # Equivalent to /("(?:\\\\|\\"|[^\\"])*")/
+    Q = '("(?:\\\\\\\\|\\\\"|[^\\\\"])*")'
+    TIMESTAMP = '(\\[[^\\]]+\\])'
+
+    S3_FORMAT = Regexp.new('^' + [S,S,TIMESTAMP,S,S,S,S,S,Q,S,S,S,S,S,S,Q,Q,S].join(' '), 'm')
+    def normalize_s3_format
+      # %Q{owner bucket [16/Mar/2010:16:00:00 +0000] 85.225.221.221 requester requestID operation key "GET /log.gif?_item_id=987&_title=V%C3%A4skor%2FFodral&_url=http%3A%2F%2Fwww.24.se%2Fvaskorfodral-c-987-1.aspx%3Fsortorder%3D1%26direction%3D0%26defps%3D10%26pagesize%3D30%26pagenum%3D3%26useparams%3D0&a=Mozilla%2F5.0%20(Macintosh%3B%20U%3B%20Intel%20Mac%20OS%20X%2010.6%3B%20sv-SE%3B%20rv%3A1.9.2)%20Gecko%2F20100115%20Firefox%2F3.6&aid=jetshop&l=sv-se&n=netscape&o=view_tag&p=macintel&r=http%3A%2F%2Fwww.24.se%2Fvaskorfodral-c-987-1.aspx%3Fsortorder%3D1%26direction%3D0%26defps%3D10%26pagesize%3D30%26pagenum%3D2%26useparams%3D0&s=1280x800&sid=www.24.se&t=V%C3%A4skor%2FFodral&u=http%3A%2F%2Fwww.24.se%2Fvaskorfodral-c-987-1.aspx%3Fsortorder%3D1%26direction%3D0%26defps%3D10%26pagesize%3D30%26pagenum%3D3%26useparams%3D0&uid=1256057859704610385&x=32058&z=-60& HTTP/1.1" 200 - 35 35 6 5 "http://www.24.se/vaskorfodral-c-987-1.aspx?sortorder=1&direction=0&defps=10&pagesize=30&pagenum=3&useparams=0" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; sv-SE; rv:1.9.2) Gecko/20100115 Firefox/3.6" -}
+      match = @line.match(S3_FORMAT)
+      if match
+        @line = %Q(#{match[4]} - - #{match[3]} #{match[9]} #{match[10]} #{match[12]} #{match[16]} #{match[17]} "-" "-"\n)
+      end
+      @line
+    end
+
+    APACHE_WITHOUT_COOKIES = Regexp.new('^' + [S,S,S,TIMESTAMP,Q,S,S,Q,Q].join(' ') + '$')
+    def normalize_apache_format
+      # Add third party cookies at end if they are not there.
+      # %Q{124.191.88.9 - - [26/May/2009:23:59:50 +0000] "GET /log.gif" "Mozilla/5.0"}
+      match = @line.match(APACHE_WITHOUT_COOKIES)
+      if match
+        @line = %Q(#{match[0]} "-" "-"\n)
+      end
+      @line
+    end
+
+    def normalize_timestamp
+      # 12/Apr/2010:09:07:23 +0200 => 12/Apr/2010:07:07:23 +0000
+      match = @line.match(/^(.*?\[)([^\]]+)(\].+)$/m)
+      unless match[2].end_with?('0000')
+        @line = "#{match[1]}#{timestamp.strftime('%d/%b/%Y:%H:%M:%S +0000')}#{match[3]}"
+      end
+      @line
+    end
+
+  end
+
+end
+
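A sketch of driving the compiler for a few days of raw logs (the option values shown are the defaults from initialize; the dates and require path are made up):

    require 'date'
    require 'logbox/observation_compiler'

    job = ObservationCompiler::Job.new(
      :raw_logs_bucket     => "rwdata-logs",
      :processed_logs_path => "local_files"
    )

    # For each day: download the raw S3 logs, gunzip them, normalize every
    # line through LogLine#normalize (S3 -> Apache format, UTC timestamps),
    # and append it to the per-day processed file. The ensure block then
    # sorts, uniqs, gzips and moves the processed files back.
    job.fetch_and_merge(Date.new(2010, 4, 10)..Date.new(2010, 4, 12))

    # Credentials come from aws_keys: /etc/s3_key.yml if present, otherwise
    # ENV['OBSENTER_S3_KEY'] plus a Mac OS X keychain lookup.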