skinny_jeans 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +5 -0
- data/README.rdoc +37 -0
- data/Rakefile +15 -0
- data/VERSION +1 -0
- data/lib/skinny_jeans.rb +157 -0
- data/skinny_jeans.gemspec +47 -0
- metadata +103 -0
data/README.rdoc
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
= SKINNY JEANS LOG PARSING WITH RUBY & SQLITE FOR HIPSTERS
|
2
|
+
http://img696.imageshack.us/img696/75/skinnys3.jpg
|
3
|
+
|
4
|
+
== EXAMPLE
|
5
|
+
|
6
|
+
* your log file has lines that look like
|
7
|
+
|
8
|
+
0.0.0.0 - - [01/Oct/2010:00:00:00 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
|
9
|
+
0.0.0.0 - - [01/Oct/2010:00:00:01 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
|
10
|
+
0.0.0.0 - - [01/Oct/2010:00:00:03 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
|
11
|
+
0.0.0.0 - - [02/Oct/2010:00:00:03 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
|
12
|
+
|
13
|
+
* then you get 2 SQL rows that look like:
|
14
|
+
2010-10-01, my-first-post, 3
|
15
|
+
2010-10-02, my-first-post, 1
|
16
|
+
* note the date column truncates the timestamp, so the days are in whatever timezone your log file reports in
|
17
|
+
|
18
|
+
|
19
|
+
== WHY?
|
20
|
+
* so you can query a database by date and path and get pageviews and have that data stored CHEAP
|
21
|
+
* because i couldn't find anything simpler and Google Analytics is limited to 10,000 API requests per day
|
22
|
+
|
23
|
+
|
24
|
+
== USAGE
|
25
|
+
sj = SkinnyJeans::execute(logfile_path = "access.log", sqlite_skinny_jeans = "sqlite_skinny_jeans.db", path_regexp = /\s\/posts\/(.*)\sHTTP/, date_regexp = /\[(\d.*\d)\]/)
|
26
|
+
sj.pageview.where("date = '2010-10-01' and path = 'my-first-post'")
|
27
|
+
=> #<SkinnyJeans::Pageview id: 1, date: "2010-10-01", path: "my-first-post", pageview_count: 3>
|
28
|
+
1. NOTE: for now *you have to monkey patch the SkinnyJeans#parse_string_as_date*
|
29
|
+
2. Parse oldest logs first, then run regularly against your main log, let logrotate handle the rest (skinny_jeans remembers where it left off)
|
30
|
+
3. ASSUMES reading log files in ascending order, keeps track of last line read so you could put it on a scheduler or cron job
|
31
|
+
4. access the 2 activerecord classes, sj.pageview (returns Pageview class), and sj.update
|
32
|
+
5. enjoy the skinny jeans
|
33
|
+
|
34
|
+
|
35
|
+
== PERFORMANCE
|
36
|
+
* it parses 100,000 lines in < 2.5 seconds
|
37
|
+
* persists 1,000 requests with 2 compound indexes in 15 seconds, or 10 seconds with home_run c extension
|
data/Rakefile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'rake'

# Declare the gem through Jeweler when it can be loaded; if loading fails,
# print install instructions instead of raising.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |gemspec|
    gemspec.name     = "skinny_jeans"
    gemspec.summary  = "Fast webserver log parser for persisting daily pageviews per path to sqlite"
    gemspec.email    = "jonathan.otto@gmail.com"
    gemspec.homepage = "http://github.com/jotto/skinny_jeans"
    gemspec.authors  = ["Jonathan Otto"]

    gemspec.add_dependency 'sqlite3-ruby', '>= 1.2.4'
    gemspec.add_dependency 'activerecord', '>= 2.3.8'
  end
rescue LoadError
  puts "Jeweler, or one of its dependencies, is not available. Install it with: sudo gem install jeweler"
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.2.2
|
data/lib/skinny_jeans.rb
ADDED
@@ -0,0 +1,157 @@
|
|
1
|
+
require 'time'
|
2
|
+
require 'benchmark'
|
3
|
+
require 'rubygems'
|
4
|
+
require 'sqlite3'
|
5
|
+
require 'active_record'
|
6
|
+
require 'zlib'
|
7
|
+
# require 'home_run'
|
8
|
+
|
9
|
+
class SkinnyJeans
|
10
|
+
|
11
|
+
# Convenience entry point: builds a parser from the four arguments, runs it
# and returns the result of the instance-level #execute.
def self.execute(logfile_path, sqlite_db_path, path_regexp, date_regexp)
  parser = new(logfile_path, sqlite_db_path, path_regexp, date_regexp)
  parser.execute
end
|
14
|
+
|
15
|
+
attr_accessor :hash_of_dates, :last_pageview_at
|
16
|
+
|
17
|
+
# Stores the parser configuration and makes sure the database is ready.
#
# logfile_path   - path of the access log; a path ending in ".gz" is read
#                  through Zlib.
# sqlite_db_path - path of the SQLite database file.
# path_regexp    - regexp whose first capture group is the path to count.
# date_regexp    - regexp whose first capture group is the timestamp string.
def initialize(logfile_path, sqlite_db_path, path_regexp, date_regexp)
  @logfile_path   = logfile_path
  @sqlite_db_path = sqlite_db_path
  @path_regexp    = path_regexp
  @date_regexp    = date_regexp
  # Only the file extension decides whether the log is gzipped; matching
  # "gz" anywhere in the path would misclassify e.g. "/logs/gz_host/access.log".
  @is_gzipped = !(logfile_path.to_s =~ /\.gz\z/i).nil?
  prepare_db
  @hash_of_dates = {}
  @last_datetime = nil
end
|
24
|
+
|
25
|
+
# Makes sure the SQLite file exists, connects ActiveRecord to it and creates
# the pageviews and updates tables (plus the pageviews indexes) when they are
# missing.
def prepare_db
  # create database if necessary; the handle is closed right away because
  # ActiveRecord opens its own connection on the next line
  SQLite3::Database.new(@sqlite_db_path).close
  ActiveRecord::Base.establish_connection(:adapter => 'sqlite3', :database => @sqlite_db_path)
  connection = ActiveRecord::Base.connection

  # create tables if necessary
  unless Pageview.table_exists?
    connection.create_table(:pageviews) do |t|
      t.column :date, :date
      t.column :path, :string
      t.column :pageview_count, :integer
    end
    # flow tight like skinny jeans with these compound indexes
    connection.add_index(:pageviews, [:date, :path], :name => "date_path_index")
    connection.add_index(:pageviews, [:date, :pageview_count], :name => "date_pageview_count_index")
  end

  unless Update.table_exists?
    connection.create_table(:updates) do |t|
      t.column :last_pageview_at, :timestamp
      t.column :lines_parsed, :integer
      t.column :last_line_parsed, :string
    end
  end
end
|
48
|
+
|
49
|
+
# Parses the log file, tallies pageviews per day and path in memory, writes
# the tallies to the pageviews table and records an Update row describing
# where this run stopped.
#
# Resuming: when an Update row from an earlier run exists, the log is first
# scanned for a line equal to the stored last_line_parsed. If one is found,
# every line up to and including it is skipped. If none is found, lines whose
# timestamp is older than the stored last_pageview_at are skipped instead.
#
# Returns self.
def execute
  lines_parsed = 0
  last_line_parsed = nil
  last_pageview_at = nil
  lineno_of_last_line_parsed = nil

  # last_update = Update.order("id DESC").limit(1).first
  last_update = Update.find(:first, :order => "id DESC", :limit => 1)

  # see if the last_line_parsed parsed exists in the current log file
  # if it doesnt exist, we'll simply read anything with a timestamp greater than last_pageview_at
  if last_update
    last_pageview_at = last_update.last_pageview_at
    last_line_parsed = last_update.last_line_parsed
    file_reader do |line, lineno|
      if line == last_line_parsed
        lineno_of_last_line_parsed = lineno
        break
      end
    end
    puts "last line parsed was\n#{last_line_parsed}\nat lineno #{lineno_of_last_line_parsed}"
  end

  realtime = Benchmark.realtime do
    file_reader do |line, lineno|
      # The resume line itself was already counted by the previous run, so it
      # must be skipped too (<=); using < would count it a second time.
      next if lineno_of_last_line_parsed && lineno <= lineno_of_last_line_parsed

      path_match = line[@path_regexp, 1]
      next if path_match.nil?
      date_match = line[@date_regexp, 1]
      next if date_match.nil?
      time_object = parse_string_as_date(date_match)

      # Timestamp fallback, used only when the resume line was not found.
      next if lineno_of_last_line_parsed.nil? && !last_pageview_at.nil? && time_object < last_pageview_at

      insert_or_increment([time_object, path_match])
      last_line_parsed = line
      lines_parsed += 1
    end
  end

  puts "completed parsing in #{realtime}"

  persisted = 0
  realtime = Benchmark.realtime do
    hash_of_dates.each do |date, hash_of_paths|
      hash_of_paths.each do |path, count|
        pv = Pageview.find_or_create_by_date_and_path(date, path)
        pv.pageview_count = (pv.pageview_count || 0) + count
        pv.save!
        persisted += 1
      end
    end
  end

  puts "completed persistence in #{realtime}"

  # When this run counted nothing, the accessor is still nil; keep the
  # previously stored timestamp so the timestamp fallback is not lost.
  Update.create!({:last_pageview_at => (self.last_pageview_at || last_pageview_at), :lines_parsed => lines_parsed, :last_line_parsed => last_line_parsed})

  puts "total records in DB: #{Pageview.count}\nlines parsed this round: #{lines_parsed}\nlines persisted this round:#{persisted}\ntotal SkinnyJeans executions since inception: #{Update.count}"

  return self
end
|
115
|
+
|
116
|
+
# Yields every line of the log file together with its zero-based line number,
# as a two-element array [line, lineno].
#
# Gzipped logs are decompressed through Zlib. Both branches use the block
# form of open, so the handle is closed even when the caller breaks out of
# the iteration early.
def file_reader
  if @is_gzipped
    Zlib::GzipReader.open(@logfile_path) do |gz|
      gz.each_with_index { |line, lineno| yield([line, lineno]) }
    end
  else
    File.open(@logfile_path, "r") do |file|
      file.each_with_index { |line, lineno| yield([line, lineno]) }
    end
  end
end
|
124
|
+
|
125
|
+
# Returns the Pageview model class (via get_ar_class).
def pageview
  get_ar_class(Pageview)
end
|
126
|
+
# Returns the Update model class (via get_ar_class).
def update
  get_ar_class(Update)
end
|
127
|
+
|
128
|
+
# Returns the class it is given, unchanged.
#
# The earlier begin/rescue around the return could never trigger, because
# returning an argument cannot raise; it has been removed as dead code.
def get_ar_class(klass)
  klass
end
|
131
|
+
|
132
|
+
private
|
133
|
+
|
134
|
+
# return a ruby Time object
|
135
|
+
# Converts a timestamp such as "02/Oct/2010:11:17:44 -0700" into a Time.
#
# date_string - day/abbreviated-month/year:hour:minute:second followed by a
#               numeric UTC offset. The offset may carry a "+" or "-" sign
#               (previously "+" offsets were rejected).
#
# Returns a Time.
# Raises ArgumentError when date_string does not match the expected layout.
def parse_string_as_date(date_string = "02/Oct/2010:11:17:44 -0700")
  match = date_string.match(/(\d{1,2})\/(\w{3,5})\/(\d{4}):(\d\d):(\d\d):(\d\d)\s([+-]?\d{3,4})/)
  raise ArgumentError, "unrecognized log timestamp: #{date_string.inspect}" if match.nil?

  day, month, year, hour, minute, seconds, zone = match.captures
  Time.parse("#{year}-#{month}-#{day} #{hour}:#{minute}:#{seconds} #{zone}")
end
|
139
|
+
|
140
|
+
# Adds one pageview to the in-memory tally for the pair's day and path, and
# remembers the pair's timestamp as the most recent pageview seen.
#
# date_path_pair - two-element array: [time object, path string].
#
# Returns the time object.
def insert_or_increment(date_path_pair)
  timestamp, path = date_path_pair
  day = timestamp.strftime("%Y-%m-%d")
  counts_for_day = (hash_of_dates[day] ||= {})
  counts_for_day[path] = (counts_for_day[path] || 0) + 1
  @last_pageview_at = timestamp
end
|
148
|
+
|
149
|
+
# ActiveRecord model with no custom behaviour; prepare_db creates the
# pageviews table with date, path and pageview_count columns.
class Pageview < ActiveRecord::Base; end
|
151
|
+
# ActiveRecord model with no custom behaviour; prepare_db creates the
# updates table with last_pageview_at, lines_parsed and last_line_parsed columns.
class Update < ActiveRecord::Base; end
|
153
|
+
|
154
|
+
|
155
|
+
end
|
156
|
+
|
157
|
+
# SkinnyJeans::execute(ARGV.first) if "#{$0}".gsub(/.*\//,"") == "skinny_jeans.rb"
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{skinny_jeans}
|
8
|
+
s.version = "0.2.2"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Jonathan Otto"]
|
12
|
+
s.date = %q{2010-10-04}
|
13
|
+
s.email = %q{jonathan.otto@gmail.com}
|
14
|
+
s.extra_rdoc_files = [
|
15
|
+
"README.rdoc"
|
16
|
+
]
|
17
|
+
s.files = [
|
18
|
+
".gitignore",
|
19
|
+
"README.rdoc",
|
20
|
+
"Rakefile",
|
21
|
+
"VERSION",
|
22
|
+
"lib/skinny_jeans.rb",
|
23
|
+
"skinny_jeans.gemspec"
|
24
|
+
]
|
25
|
+
s.homepage = %q{http://github.com/jotto/skinny_jeans}
|
26
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
27
|
+
s.require_paths = ["lib"]
|
28
|
+
s.rubygems_version = %q{1.3.7}
|
29
|
+
s.summary = %q{Fast webserver log parser for persisting daily pageviews per path to sqlite}
|
30
|
+
|
31
|
+
if s.respond_to? :specification_version then
|
32
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
33
|
+
s.specification_version = 3
|
34
|
+
|
35
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
36
|
+
s.add_runtime_dependency(%q<sqlite3-ruby>, [">= 1.2.4"])
|
37
|
+
s.add_runtime_dependency(%q<activerecord>, [">= 2.3.8"])
|
38
|
+
else
|
39
|
+
s.add_dependency(%q<sqlite3-ruby>, [">= 1.2.4"])
|
40
|
+
s.add_dependency(%q<activerecord>, [">= 2.3.8"])
|
41
|
+
end
|
42
|
+
else
|
43
|
+
s.add_dependency(%q<sqlite3-ruby>, [">= 1.2.4"])
|
44
|
+
s.add_dependency(%q<activerecord>, [">= 2.3.8"])
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
metadata
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: skinny_jeans
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 19
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
- 2
|
10
|
+
version: 0.2.2
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Jonathan Otto
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-10-04 00:00:00 -05:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: sqlite3-ruby
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 23
|
30
|
+
segments:
|
31
|
+
- 1
|
32
|
+
- 2
|
33
|
+
- 4
|
34
|
+
version: 1.2.4
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id001
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: activerecord
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ">="
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
hash: 19
|
46
|
+
segments:
|
47
|
+
- 2
|
48
|
+
- 3
|
49
|
+
- 8
|
50
|
+
version: 2.3.8
|
51
|
+
type: :runtime
|
52
|
+
version_requirements: *id002
|
53
|
+
description:
|
54
|
+
email: jonathan.otto@gmail.com
|
55
|
+
executables: []
|
56
|
+
|
57
|
+
extensions: []
|
58
|
+
|
59
|
+
extra_rdoc_files:
|
60
|
+
- README.rdoc
|
61
|
+
files:
|
62
|
+
- .gitignore
|
63
|
+
- README.rdoc
|
64
|
+
- Rakefile
|
65
|
+
- VERSION
|
66
|
+
- lib/skinny_jeans.rb
|
67
|
+
- skinny_jeans.gemspec
|
68
|
+
has_rdoc: true
|
69
|
+
homepage: http://github.com/jotto/skinny_jeans
|
70
|
+
licenses: []
|
71
|
+
|
72
|
+
post_install_message:
|
73
|
+
rdoc_options:
|
74
|
+
- --charset=UTF-8
|
75
|
+
require_paths:
|
76
|
+
- lib
|
77
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
78
|
+
none: false
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
hash: 3
|
83
|
+
segments:
|
84
|
+
- 0
|
85
|
+
version: "0"
|
86
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
87
|
+
none: false
|
88
|
+
requirements:
|
89
|
+
- - ">="
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
hash: 3
|
92
|
+
segments:
|
93
|
+
- 0
|
94
|
+
version: "0"
|
95
|
+
requirements: []
|
96
|
+
|
97
|
+
rubyforge_project:
|
98
|
+
rubygems_version: 1.3.7
|
99
|
+
signing_key:
|
100
|
+
specification_version: 3
|
101
|
+
summary: Fast webserver log parser for persisting daily pageviews per path to sqlite
|
102
|
+
test_files: []
|
103
|
+
|