skinny-jeans 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +14 -14
- data/VERSION +1 -1
- data/lib/skinny_jeans.rb +110 -82
- data/skinny-jeans.gemspec +2 -2
- metadata +4 -4
data/README.rdoc
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
= SKINNY JEANS LOG PARSING WITH RUBY & SQLITE FOR HIPSTERS
|
2
2
|
http://img696.imageshack.us/img696/75/skinnys3.jpg
|
3
3
|
|
4
|
-
==
|
4
|
+
== EXAMPLE
|
5
5
|
|
6
6
|
* your log file has lines that look like
|
7
7
|
|
@@ -10,9 +10,10 @@ http://img696.imageshack.us/img696/75/skinnys3.jpg
|
|
10
10
|
0.0.0.0 - - [01/Oct/2010:00:00:03 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
|
11
11
|
0.0.0.0 - - [02/Oct/2010:00:00:03 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
|
12
12
|
|
13
|
-
* then you get
|
13
|
+
* then you get 2 SQL rows that look like:
|
14
14
|
2010-10-01, my-first-post, 3
|
15
15
|
2010-10-02, my-first-post, 1
|
16
|
+
* note the date columns truncate timestamp, so the days are in whatever timezone your log file reports in
|
16
17
|
|
17
18
|
|
18
19
|
== WHY?
|
@@ -20,18 +21,17 @@ http://img696.imageshack.us/img696/75/skinnys3.jpg
|
|
20
21
|
* because i couldn't find anything simpler and Google Analytics is limited to 10,000 API requests per day
|
21
22
|
|
22
23
|
|
23
|
-
==
|
24
|
-
SkinnyJeans::execute(logfile_path = "access.log", sqlite_skinny_jeans = "sqlite_skinny_jeans.db", path_regexp = /\s\/posts\/(.*)\sHTTP/, date_regexp = /\[(\d.*\d)\]/)
|
25
|
-
|
26
|
-
|
24
|
+
== USAGE
|
25
|
+
sj = SkinnyJeans::execute(logfile_path = "access.log", sqlite_skinny_jeans = "sqlite_skinny_jeans.db", path_regexp = /\s\/posts\/(.*)\sHTTP/, date_regexp = /\[(\d.*\d)\]/)
|
26
|
+
sj.pageview.where("date = '2010-10-01' and path = 'my-first-post'")
|
27
|
+
=> #<SkinnyJeans::Pageview id: 1, date: "2010-10-01", path: "my-first-post", pageview_count: 3>
|
28
|
+
1. NOTE: for now *you have to monkey patch the SkinnyJeans#parse_string_as_date*
|
29
|
+
2. Parse oldest logs first, then run regularly against your main log, let logrotate handle the rest (skinny_jeans remembers where it left off)
|
30
|
+
3. ASSUMES reading log files in ascending order, keeps track of last line read so you could put it on a scheduler or cron job
|
31
|
+
4. access the 2 activerecord classes, sj.pageview (returns Pageview class), and sj.update
|
32
|
+
5. enjoy the skinny jeans
|
27
33
|
|
28
34
|
|
29
|
-
==
|
35
|
+
== PERFORMANCE
|
30
36
|
* it parses 100,000 lines in < 2.5 seconds
|
31
|
-
* persists 1,000 requests with 2 compound indexes in 10 seconds
|
32
|
-
|
33
|
-
* parse a webserver's log file to aggregate paths by DAY with pageview counts
|
34
|
-
|
35
|
-
* creates sqlite database with columns: date, path, pageview_count
|
36
|
-
|
37
|
-
* ASSUMES reading log files in ascending order, keeps track of last line read so you could put it on a scheduler or cron job
|
37
|
+
* persists 1,000 requests with 2 compound indexes in 15 seconds, or 10 seconds with home_run c extension
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.2.0
|
data/lib/skinny_jeans.rb
CHANGED
@@ -3,115 +3,137 @@ require 'benchmark'
|
|
3
3
|
require 'rubygems'
|
4
4
|
require 'sqlite3'
|
5
5
|
require 'active_record'
|
6
|
+
require 'zlib'
|
6
7
|
# require 'home_run'
|
7
8
|
|
8
|
-
class Pageview < ActiveRecord::Base
|
9
|
-
end
|
10
|
-
class Update < ActiveRecord::Base
|
11
|
-
end
|
12
|
-
|
13
9
|
class SkinnyJeans
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
10
|
+
|
11
|
+
def self.execute(logfile_path, sqlite_db_path, path_regexp, date_regexp)
|
12
|
+
self.new(logfile_path, sqlite_db_path, path_regexp, date_regexp).execute
|
13
|
+
end
|
14
|
+
|
15
|
+
attr_accessor :hash_of_dates, :last_pageview_at
|
16
|
+
|
17
|
+
def initialize(logfile_path, sqlite_db_path, path_regexp, date_regexp)
|
18
|
+
@logfile_path, @sqlite_db_path, @path_regexp, @date_regexp = [logfile_path, sqlite_db_path, path_regexp, date_regexp]
|
19
|
+
@is_gzipped = !logfile_path.to_s[/gz/].nil?
|
20
|
+
prepare_db
|
21
|
+
@hash_of_dates = {}
|
22
|
+
@last_datetime = nil
|
23
|
+
end
|
24
|
+
|
25
|
+
def prepare_db
|
26
|
+
# create database if necessary
|
27
|
+
SQLite3::Database.new(@sqlite_db_path)
|
28
|
+
ActiveRecord::Base.establish_connection(:adapter => 'sqlite3', :database => @sqlite_db_path)
|
29
|
+
# create tables if necessary
|
30
|
+
if !Pageview.table_exists?
|
31
|
+
ActiveRecord::Base.connection.create_table(:pageviews) do |t|
|
32
|
+
t.column :date, :date
|
33
|
+
t.column :path, :string
|
34
|
+
t.column :pageview_count, :integer
|
30
35
|
end
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
36
|
+
# flow tight like skinny jeans with these compound indexes
|
37
|
+
ActiveRecord::Base.connection.add_index(:pageviews, [:date, :path], :name => "date_path_index")
|
38
|
+
ActiveRecord::Base.connection.add_index(:pageviews, [:date, :pageview_count], :name => "date_pageview_count_index")
|
39
|
+
end
|
40
|
+
if !Update.table_exists?
|
41
|
+
ActiveRecord::Base.connection.create_table(:updates) do |t|
|
42
|
+
t.column :last_pageview_at, :timestamp
|
43
|
+
t.column :lines_parsed, :integer
|
44
|
+
t.column :last_line_parsed, :string
|
37
45
|
end
|
38
46
|
end
|
47
|
+
end
|
48
|
+
|
49
|
+
def execute
|
50
|
+
|
51
|
+
lines_parsed = 0
|
52
|
+
last_line_parsed, last_pageview_at, lineno_of_last_line_parsed = [nil,nil,nil]
|
53
|
+
last_update = Update.order("id DESC").limit(1).first
|
39
54
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
if last_update
|
49
|
-
last_pageview_at, last_line_parsed = last_update.last_pageview_at, last_update.last_line_parsed
|
50
|
-
File.new(logfile_path, "r").each_with_index do |line, lineno|
|
51
|
-
if line == last_line_parsed
|
52
|
-
lineno_of_last_line_parsed = lineno
|
53
|
-
break
|
54
|
-
end
|
55
|
+
# see if the last_line_parsed parsed exists in the current log file
|
56
|
+
# if it doesnt exist, we'll simply read anything with a timestamp greater than last_pageview_at
|
57
|
+
if last_update
|
58
|
+
last_pageview_at, last_line_parsed = last_update.last_pageview_at, last_update.last_line_parsed
|
59
|
+
file_reader do |line, lineno|
|
60
|
+
if line == last_line_parsed
|
61
|
+
lineno_of_last_line_parsed = lineno
|
62
|
+
break
|
55
63
|
end
|
56
|
-
puts "last line parsed was\n#{last_line_parsed}\nat lineno #{lineno_of_last_line_parsed}"
|
57
64
|
end
|
65
|
+
puts "last line parsed was\n#{last_line_parsed}\nat lineno #{lineno_of_last_line_parsed}"
|
66
|
+
end
|
58
67
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
File.new(logfile_path, "r").each_with_index do |line, lineno|
|
63
|
-
next if lineno_of_last_line_parsed && lineno < lineno_of_last_line_parsed
|
68
|
+
realtime = Benchmark.realtime do
|
69
|
+
date_path_pairs_array = []
|
70
|
+
lineno = -1
|
64
71
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
next if date_match.nil?
|
69
|
-
time_object = parse_string_as_date(date_match)
|
72
|
+
file_reader do |line, index|
|
73
|
+
lineno += 1
|
74
|
+
next if lineno_of_last_line_parsed && lineno < lineno_of_last_line_parsed
|
70
75
|
|
71
|
-
|
76
|
+
path_match = line[@path_regexp, 1]
|
77
|
+
next if path_match.nil?
|
78
|
+
date_match = line[@date_regexp, 1]
|
79
|
+
next if date_match.nil?
|
80
|
+
time_object = parse_string_as_date(date_match)
|
72
81
|
|
73
|
-
|
74
|
-
last_line_parsed = line
|
75
|
-
lines_parsed += 1
|
76
|
-
end
|
82
|
+
next if lineno_of_last_line_parsed.nil? && !last_pageview_at.nil? && time_object < last_pageview_at
|
77
83
|
|
78
|
-
|
84
|
+
insert_or_increment([time_object,path_match])
|
85
|
+
last_line_parsed = line
|
86
|
+
lines_parsed += 1
|
79
87
|
end
|
88
|
+
end
|
80
89
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
90
|
+
puts "completed parsing in #{realtime}"
|
91
|
+
|
92
|
+
persisted = 0
|
93
|
+
realtime = Benchmark.realtime do
|
94
|
+
hash_of_dates.each do |date, hash_of_paths|
|
95
|
+
hash_of_paths.keys.each do |path|
|
96
|
+
pv = Pageview.find_or_create_by_date_and_path(date, path)
|
97
|
+
pv.pageview_count ||= 0
|
98
|
+
pv.pageview_count += hash_of_paths[path]
|
99
|
+
pv.save!
|
100
|
+
persisted += 1
|
91
101
|
end
|
92
102
|
end
|
93
|
-
|
94
|
-
|
103
|
+
end
|
104
|
+
|
105
|
+
puts "completed persistence in #{realtime}"
|
95
106
|
|
96
|
-
|
107
|
+
Update.create!({:last_pageview_at => self.last_pageview_at, :lines_parsed => lines_parsed, :last_line_parsed => last_line_parsed})
|
97
108
|
|
98
|
-
|
109
|
+
puts "total records in DB: #{Pageview.count}\nlines parsed this round: #{lines_parsed}\nlines persisted this round:#{persisted}\ntotal SkinnyJeans executions since inception: #{Update.count}"
|
99
110
|
|
100
|
-
|
111
|
+
return self
|
112
|
+
|
113
|
+
end
|
101
114
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
115
|
+
def file_reader
|
116
|
+
if @is_gzipped
|
117
|
+
lineno = 0
|
118
|
+
Zlib::GzipReader.open(@logfile_path){|line|yield([line.read,lineno]);lineno+=1}
|
119
|
+
else
|
120
|
+
File.new(@logfile_path, "r").each_with_index{|line, lineno| yield([line,lineno])}
|
106
121
|
end
|
122
|
+
end
|
123
|
+
|
124
|
+
def pageview;get_ar_class(Pageview);end
|
125
|
+
def update;get_ar_class(Update);end
|
107
126
|
|
127
|
+
def get_ar_class(klass)
|
128
|
+
begin;return(klass);rescue(ActiveRecord::ConnectionNotEstablished);prepare_db;end
|
108
129
|
end
|
109
130
|
|
110
|
-
|
131
|
+
private
|
111
132
|
|
112
|
-
|
113
|
-
|
114
|
-
|
133
|
+
# return a ruby Time object
|
134
|
+
def parse_string_as_date(date_string = "02/Oct/2010:11:17:44 -0700")
|
135
|
+
day,month,year,hour,minute,seconds,zone = date_string.scan(/(\d{1,2})\/(\w{3,5})\/(\d{4}):(\d\d):(\d\d):(\d\d)\s(-?\d{3,4})/).flatten
|
136
|
+
Time.parse("#{year}-#{month}-#{day} #{hour}:#{minute}:#{seconds} #{zone}")
|
115
137
|
end
|
116
138
|
|
117
139
|
def insert_or_increment(date_path_pair)
|
@@ -123,6 +145,12 @@ class SkinnyJeans
|
|
123
145
|
@last_pageview_at = datetime
|
124
146
|
end
|
125
147
|
|
148
|
+
class Pageview < ActiveRecord::Base
|
149
|
+
end
|
150
|
+
class Update < ActiveRecord::Base
|
151
|
+
end
|
152
|
+
|
153
|
+
|
126
154
|
end
|
127
155
|
|
128
156
|
# SkinnyJeans::execute(ARGV.first) if "#{$0}".gsub(/.*\//,"") == "skinny_jeans.rb"
|
data/skinny-jeans.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{skinny-jeans}
|
8
|
-
s.version = "0.1.0"
|
8
|
+
s.version = "0.2.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Jonathan Otto"]
|
12
|
-
s.date = %q{2010-10-
|
12
|
+
s.date = %q{2010-10-04}
|
13
13
|
s.email = %q{jonathan.otto@gmail.com}
|
14
14
|
s.extra_rdoc_files = [
|
15
15
|
"README.rdoc"
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: skinny-jeans
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 23
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 1
|
8
|
+
- 2
|
9
9
|
- 0
|
10
|
-
version: 0.1.0
|
10
|
+
version: 0.2.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jonathan Otto
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-10-
|
18
|
+
date: 2010-10-04 00:00:00 -05:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|