skinny_jeans 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/README.rdoc +37 -0
- data/Rakefile +15 -0
- data/VERSION +1 -0
- data/lib/skinny_jeans.rb +157 -0
- data/skinny_jeans.gemspec +47 -0
- metadata +103 -0
data/README.rdoc
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
= SKINNY JEANS LOG PARSING WITH RUBY & SQLITE FOR HIPSTERS
|
|
2
|
+
http://img696.imageshack.us/img696/75/skinnys3.jpg
|
|
3
|
+
|
|
4
|
+
== EXAMPLE
|
|
5
|
+
|
|
6
|
+
* your log file has lines that look like
|
|
7
|
+
|
|
8
|
+
0.0.0.0 - - [01/Oct/2010:00:00:00 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
|
|
9
|
+
0.0.0.0 - - [01/Oct/2010:00:00:01 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
|
|
10
|
+
0.0.0.0 - - [01/Oct/2010:00:00:03 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
|
|
11
|
+
0.0.0.0 - - [02/Oct/2010:00:00:03 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
|
|
12
|
+
|
|
13
|
+
* then you get 2 SQL rows that look like:
|
|
14
|
+
2010-10-01, my-first-post, 3
|
|
15
|
+
2010-10-02, my-first-post, 1
|
|
16
|
+
* note that the date column truncates the timestamp, so the days are in whatever timezone your log file reports
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
== WHY?
|
|
20
|
+
* so you can query a database by date and path and get pageviews and have that data stored CHEAP
|
|
21
|
+
* because i couldn't find anything simpler and Google Analytics is limited to 10,000 API requests per day
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
== USAGE
|
|
25
|
+
sj = SkinnyJeans::execute(logfile_path = "access.log", sqlite_skinny_jeans = "sqlite_skinny_jeans.db", path_regexp = /\s\/posts\/(.*)\sHTTP/, date_regexp = /\[(\d.*\d)\]/)
|
|
26
|
+
sj.pageview.where("date = '2010-10-01' and path = 'my-first-post'")
|
|
27
|
+
=> #<SkinnyJeans::Pageview id: 1, date: "2010-10-01", path: "my-first-post", pageview_count: 3>
|
|
28
|
+
1. NOTE: for now *you have to monkey patch SkinnyJeans#parse_string_as_date* if your log timestamps are not in the <tt>02/Oct/2010:11:17:44 -0700</tt> format
|
|
29
|
+
2. Parse oldest logs first, then run regularly against your main log, let logrotate handle the rest (skinny_jeans remembers where it left off)
|
|
30
|
+
3. ASSUMES reading log files in ascending order, keeps track of last line read so you could put it on a scheduler or cron job
|
|
31
|
+
4. access the 2 activerecord classes: sj.pageview (returns the Pageview class) and sj.update (returns the Update class)
|
|
32
|
+
5. enjoy the skinny jeans
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
== PERFORMANCE
|
|
36
|
+
* it parses 100,000 lines in < 2.5 seconds
|
|
37
|
+
* persists 1,000 requests with 2 compound indexes in 15 seconds, or 10 seconds with home_run c extension
|
data/Rakefile
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
require 'rake'

# Gem packaging tasks come from Jeweler. When Jeweler cannot be loaded the
# Rakefile still loads and only prints an installation hint.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |gemspec|
    gemspec.name     = "skinny_jeans"
    gemspec.summary  = "Fast webserver log parser for persisting daily pageviews per path to sqlite"
    gemspec.email    = "jonathan.otto@gmail.com"
    gemspec.homepage = "http://github.com/jotto/skinny_jeans"
    gemspec.authors  = ["Jonathan Otto"]

    # Runtime dependencies of the library.
    gemspec.add_dependency 'sqlite3-ruby', '>= 1.2.4'
    gemspec.add_dependency 'activerecord', '>= 2.3.8'
  end
rescue LoadError
  puts "Jeweler, or one of its dependencies, is not available. Install it with: sudo gem install jeweler"
end
|
data/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.2.2
|
data/lib/skinny_jeans.rb
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
require 'time'
|
|
2
|
+
require 'benchmark'
|
|
3
|
+
require 'rubygems'
|
|
4
|
+
require 'sqlite3'
|
|
5
|
+
require 'active_record'
|
|
6
|
+
require 'zlib'
|
|
7
|
+
# require 'home_run'
|
|
8
|
+
|
|
9
|
+
class SkinnyJeans
|
|
10
|
+
|
|
11
|
+
# Convenience entry point: builds a parser for the given log file and runs it
# immediately.
#
# All four arguments are forwarded unchanged to the constructor; the return
# value is whatever the instance-level #execute returns.
def self.execute(logfile_path, sqlite_db_path, path_regexp, date_regexp)
  parser = new(logfile_path, sqlite_db_path, path_regexp, date_regexp)
  parser.execute
end
|
|
14
|
+
|
|
15
|
+
attr_accessor :hash_of_dates, :last_pageview_at
|
|
16
|
+
|
|
17
|
+
# Stores the parser configuration and makes sure the database schema exists.
#
# logfile_path   - path of the access log; treated as gzip when the name
#                  ends in ".gz"
# sqlite_db_path - path of the SQLite database file
# path_regexp    - Regexp whose first capture group is the path to count
# date_regexp    - Regexp whose first capture group is the raw timestamp
def initialize(logfile_path, sqlite_db_path, path_regexp, date_regexp)
  @logfile_path   = logfile_path
  @sqlite_db_path = sqlite_db_path
  @path_regexp    = path_regexp
  @date_regexp    = date_regexp

  # Only a ".gz" suffix marks a gzip file. Matching /gz/ anywhere in the path
  # also fired on names such as "/var/log/blogz/access.log".
  @is_gzipped = !(logfile_path.to_s =~ /\.gz\z/i).nil?

  prepare_db
  @hash_of_dates = {}
  @last_datetime = nil
end
|
|
24
|
+
|
|
25
|
+
# Connects ActiveRecord to the SQLite database and creates the "pageviews"
# and "updates" tables (plus the pageview indexes) when they do not exist yet.
def prepare_db
  # create database if necessary; the handle is only needed for that, so it
  # is closed right away instead of being left open (ActiveRecord opens its
  # own connection below).
  SQLite3::Database.new(@sqlite_db_path).close
  ActiveRecord::Base.establish_connection(:adapter => 'sqlite3', :database => @sqlite_db_path)
  connection = ActiveRecord::Base.connection

  # create tables if necessary
  unless Pageview.table_exists?
    connection.create_table(:pageviews) do |t|
      t.column :date, :date
      t.column :path, :string
      t.column :pageview_count, :integer
    end
    # flow tight like skinny jeans with these compound indexes
    connection.add_index(:pageviews, [:date, :path], :name => "date_path_index")
    connection.add_index(:pageviews, [:date, :pageview_count], :name => "date_pageview_count_index")
  end

  unless Update.table_exists?
    connection.create_table(:updates) do |t|
      t.column :last_pageview_at, :timestamp
      t.column :lines_parsed, :integer
      t.column :last_line_parsed, :string
    end
  end
end
|
|
48
|
+
|
|
49
|
+
# Parses the log file, aggregates pageviews per day and path, persists the
# counts and records where this run stopped.
#
# Resuming: when an earlier Update row exists, the log is first scanned for
# the exact line that run finished on. If it is found, everything up to and
# including that line is skipped. If it is not found, lines whose timestamp
# is older than the stored last_pageview_at are skipped instead.
#
# Prints progress information to stdout and returns self.
def execute
  lines_parsed = 0
  last_line_parsed = last_pageview_at = lineno_of_last_line_parsed = nil
  last_update = Update.find(:first, :order => "id DESC", :limit => 1)

  # see if the last_line_parsed parsed exists in the current log file
  # if it doesnt exist, we'll simply read anything with a timestamp greater than last_pageview_at
  if last_update
    last_pageview_at = last_update.last_pageview_at
    last_line_parsed = last_update.last_line_parsed
    file_reader do |line, lineno|
      if line == last_line_parsed
        lineno_of_last_line_parsed = lineno
        break
      end
    end
    puts "last line parsed was\n#{last_line_parsed}\nat lineno #{lineno_of_last_line_parsed}"
  end

  realtime = Benchmark.realtime do
    file_reader do |line, lineno|
      # The line at lineno_of_last_line_parsed was already counted by the
      # previous run, so it has to be skipped as well (<=, not <); otherwise
      # it is counted once more on every run.
      next if lineno_of_last_line_parsed && lineno <= lineno_of_last_line_parsed

      path_match = line[@path_regexp, 1]
      next if path_match.nil?
      date_match = line[@date_regexp, 1]
      next if date_match.nil?
      time_object = parse_string_as_date(date_match)

      # Fallback when the previous last line is not in this file: skip by time.
      next if lineno_of_last_line_parsed.nil? && !last_pageview_at.nil? && time_object < last_pageview_at

      insert_or_increment([time_object, path_match])
      last_line_parsed = line
      lines_parsed += 1
    end
  end

  puts "completed parsing in #{realtime}"

  persisted = 0
  realtime = Benchmark.realtime do
    hash_of_dates.each do |date, hash_of_paths|
      hash_of_paths.each do |path, count|
        pv = Pageview.find_or_create_by_date_and_path(date, path)
        pv.pageview_count = (pv.pageview_count || 0) + count
        pv.save!
        persisted += 1
      end
    end
  end

  puts "completed persistence in #{realtime}"

  # When nothing new was parsed, self.last_pageview_at is still nil; keep the
  # previous run's timestamp so the time-based resume keeps working.
  Update.create!({
    :last_pageview_at => self.last_pageview_at || last_pageview_at,
    :lines_parsed     => lines_parsed,
    :last_line_parsed => last_line_parsed
  })

  puts "total records in DB: #{Pageview.count}\nlines parsed this round: #{lines_parsed}\nlines persisted this round:#{persisted}\ntotal SkinnyJeans executions since inception: #{Update.count}"

  return self
end
|
|
115
|
+
|
|
116
|
+
# Yields each line of the log file together with its zero-based line number,
# as a two-element array [line, lineno]. Files flagged as gzip are
# decompressed while reading.
#
# The file handle is closed when iteration ends, including when the caller
# leaves the block early with `break`.
def file_reader
  if @is_gzipped
    # GzipReader.open yields the reader object, not individual lines, so it
    # has to be iterated; calling #read on it would hand the whole file to
    # the caller as a single "line".
    Zlib::GzipReader.open(@logfile_path) do |gz|
      gz.each_with_index { |line, lineno| yield([line, lineno]) }
    end
  else
    File.open(@logfile_path, "r") do |file|
      file.each_with_index { |line, lineno| yield([line, lineno]) }
    end
  end
end
|
|
124
|
+
|
|
125
|
+
# Accessor for the Pageview model class, routed through get_ar_class.
def pageview
  get_ar_class(Pageview)
end
|
|
126
|
+
# Accessor for the Update model class, routed through get_ar_class.
def update
  get_ar_class(Update)
end
|
|
127
|
+
|
|
128
|
+
# Returns the given ActiveRecord class, making sure a database connection is
# available first.
#
# Previously the body was `begin; return(klass); rescue ...; end`: returning
# a constant can never raise, so the rescue branch was unreachable. The
# connection is now actually probed, and prepare_db is run when ActiveRecord
# reports that none has been established.
#
# klass - the model class to hand back
#
# Returns klass in every case.
def get_ar_class(klass)
  begin
    ActiveRecord::Base.connection
  rescue ActiveRecord::ConnectionNotEstablished
    prepare_db
  end
  klass
end
|
|
131
|
+
|
|
132
|
+
private
|
|
133
|
+
|
|
134
|
+
# return a ruby Time object
|
|
135
|
+
# Converts a log timestamp into a Time.
#
# date_string - timestamp of the form "02/Oct/2010:11:17:44 -0700"
#               (day/month-name/year:hour:minute:second utc-offset)
#
# Raises ArgumentError when date_string does not have that form.
def parse_string_as_date(date_string = "02/Oct/2010:11:17:44 -0700")
  # [+-]? instead of -? so that offsets east of UTC ("+0100") match as well.
  parts = date_string.scan(/(\d{1,2})\/(\w{3,5})\/(\d{4}):(\d\d):(\d\d):(\d\d)\s([+-]?\d{3,4})/).flatten
  raise ArgumentError, "unrecognized log timestamp: #{date_string.inspect}" if parts.empty?

  day, month, year, hour, minute, seconds, zone = parts
  # Day, month name, year order (as in RFC 2822 dates) is unambiguous for Time.parse.
  Time.parse("#{day} #{month} #{year} #{hour}:#{minute}:#{seconds} #{zone}")
end
|
|
139
|
+
|
|
140
|
+
# Adds one pageview to the in-memory tally and remembers the timestamp of the
# most recently counted request.
#
# date_path_pair - two-element array [time, path]; the time is bucketed by
#                  its "%Y-%m-%d" representation
def insert_or_increment(date_path_pair)
  timestamp, request_path = date_path_pair
  day_key = timestamp.strftime("%Y-%m-%d")

  counts_for_day = (hash_of_dates[day_key] ||= {})
  counts_for_day[request_path] = (counts_for_day[request_path] || 0) + 1

  @last_pageview_at = timestamp
end
|
|
148
|
+
|
|
149
|
+
# ActiveRecord model for the per-day, per-path pageview counts.
class Pageview < ActiveRecord::Base; end

# ActiveRecord model for the bookkeeping row written at the end of each run.
class Update < ActiveRecord::Base; end
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
# SkinnyJeans::execute(ARGV.first) if "#{$0}".gsub(/.*\//,"") == "skinny_jeans.rb"
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Generated by jeweler
|
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
|
4
|
+
# -*- encoding: utf-8 -*-
|
|
5
|
+
|
|
6
|
+
Gem::Specification.new do |s|
|
|
7
|
+
s.name = %q{skinny_jeans}
|
|
8
|
+
s.version = "0.2.2"
|
|
9
|
+
|
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
|
11
|
+
s.authors = ["Jonathan Otto"]
|
|
12
|
+
s.date = %q{2010-10-04}
|
|
13
|
+
s.email = %q{jonathan.otto@gmail.com}
|
|
14
|
+
s.extra_rdoc_files = [
|
|
15
|
+
"README.rdoc"
|
|
16
|
+
]
|
|
17
|
+
s.files = [
|
|
18
|
+
".gitignore",
|
|
19
|
+
"README.rdoc",
|
|
20
|
+
"Rakefile",
|
|
21
|
+
"VERSION",
|
|
22
|
+
"lib/skinny_jeans.rb",
|
|
23
|
+
"skinny_jeans.gemspec"
|
|
24
|
+
]
|
|
25
|
+
s.homepage = %q{http://github.com/jotto/skinny_jeans}
|
|
26
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
|
27
|
+
s.require_paths = ["lib"]
|
|
28
|
+
s.rubygems_version = %q{1.3.7}
|
|
29
|
+
s.summary = %q{Fast webserver log parser for persisting daily pageviews per path to sqlite}
|
|
30
|
+
|
|
31
|
+
if s.respond_to? :specification_version then
|
|
32
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
|
33
|
+
s.specification_version = 3
|
|
34
|
+
|
|
35
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
|
36
|
+
s.add_runtime_dependency(%q<sqlite3-ruby>, [">= 1.2.4"])
|
|
37
|
+
s.add_runtime_dependency(%q<activerecord>, [">= 2.3.8"])
|
|
38
|
+
else
|
|
39
|
+
s.add_dependency(%q<sqlite3-ruby>, [">= 1.2.4"])
|
|
40
|
+
s.add_dependency(%q<activerecord>, [">= 2.3.8"])
|
|
41
|
+
end
|
|
42
|
+
else
|
|
43
|
+
s.add_dependency(%q<sqlite3-ruby>, [">= 1.2.4"])
|
|
44
|
+
s.add_dependency(%q<activerecord>, [">= 2.3.8"])
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
|
metadata
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: skinny_jeans
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
hash: 19
|
|
5
|
+
prerelease: false
|
|
6
|
+
segments:
|
|
7
|
+
- 0
|
|
8
|
+
- 2
|
|
9
|
+
- 2
|
|
10
|
+
version: 0.2.2
|
|
11
|
+
platform: ruby
|
|
12
|
+
authors:
|
|
13
|
+
- Jonathan Otto
|
|
14
|
+
autorequire:
|
|
15
|
+
bindir: bin
|
|
16
|
+
cert_chain: []
|
|
17
|
+
|
|
18
|
+
date: 2010-10-04 00:00:00 -05:00
|
|
19
|
+
default_executable:
|
|
20
|
+
dependencies:
|
|
21
|
+
- !ruby/object:Gem::Dependency
|
|
22
|
+
name: sqlite3-ruby
|
|
23
|
+
prerelease: false
|
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
|
25
|
+
none: false
|
|
26
|
+
requirements:
|
|
27
|
+
- - ">="
|
|
28
|
+
- !ruby/object:Gem::Version
|
|
29
|
+
hash: 23
|
|
30
|
+
segments:
|
|
31
|
+
- 1
|
|
32
|
+
- 2
|
|
33
|
+
- 4
|
|
34
|
+
version: 1.2.4
|
|
35
|
+
type: :runtime
|
|
36
|
+
version_requirements: *id001
|
|
37
|
+
- !ruby/object:Gem::Dependency
|
|
38
|
+
name: activerecord
|
|
39
|
+
prerelease: false
|
|
40
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
|
41
|
+
none: false
|
|
42
|
+
requirements:
|
|
43
|
+
- - ">="
|
|
44
|
+
- !ruby/object:Gem::Version
|
|
45
|
+
hash: 19
|
|
46
|
+
segments:
|
|
47
|
+
- 2
|
|
48
|
+
- 3
|
|
49
|
+
- 8
|
|
50
|
+
version: 2.3.8
|
|
51
|
+
type: :runtime
|
|
52
|
+
version_requirements: *id002
|
|
53
|
+
description:
|
|
54
|
+
email: jonathan.otto@gmail.com
|
|
55
|
+
executables: []
|
|
56
|
+
|
|
57
|
+
extensions: []
|
|
58
|
+
|
|
59
|
+
extra_rdoc_files:
|
|
60
|
+
- README.rdoc
|
|
61
|
+
files:
|
|
62
|
+
- .gitignore
|
|
63
|
+
- README.rdoc
|
|
64
|
+
- Rakefile
|
|
65
|
+
- VERSION
|
|
66
|
+
- lib/skinny_jeans.rb
|
|
67
|
+
- skinny_jeans.gemspec
|
|
68
|
+
has_rdoc: true
|
|
69
|
+
homepage: http://github.com/jotto/skinny_jeans
|
|
70
|
+
licenses: []
|
|
71
|
+
|
|
72
|
+
post_install_message:
|
|
73
|
+
rdoc_options:
|
|
74
|
+
- --charset=UTF-8
|
|
75
|
+
require_paths:
|
|
76
|
+
- lib
|
|
77
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
78
|
+
none: false
|
|
79
|
+
requirements:
|
|
80
|
+
- - ">="
|
|
81
|
+
- !ruby/object:Gem::Version
|
|
82
|
+
hash: 3
|
|
83
|
+
segments:
|
|
84
|
+
- 0
|
|
85
|
+
version: "0"
|
|
86
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
87
|
+
none: false
|
|
88
|
+
requirements:
|
|
89
|
+
- - ">="
|
|
90
|
+
- !ruby/object:Gem::Version
|
|
91
|
+
hash: 3
|
|
92
|
+
segments:
|
|
93
|
+
- 0
|
|
94
|
+
version: "0"
|
|
95
|
+
requirements: []
|
|
96
|
+
|
|
97
|
+
rubyforge_project:
|
|
98
|
+
rubygems_version: 1.3.7
|
|
99
|
+
signing_key:
|
|
100
|
+
specification_version: 3
|
|
101
|
+
summary: Fast webserver log parser for persisting daily pageviews per path to sqlite
|
|
102
|
+
test_files: []
|
|
103
|
+
|