skinny_jeans 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.db
2
+ *.gz
3
+ *.log
4
+ pkg
5
+ .DS*
data/README.rdoc ADDED
@@ -0,0 +1,37 @@
1
+ = SKINNY JEANS LOG PARSING WITH RUBY & SQLITE FOR HIPSTERS
2
+ http://img696.imageshack.us/img696/75/skinnys3.jpg
3
+
4
+ == EXAMPLE
5
+
6
+ * your log file has lines that look like
7
+
8
+ 0.0.0.0 - - [01/Oct/2010:00:00:00 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
9
+ 0.0.0.0 - - [01/Oct/2010:00:00:01 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
10
+ 0.0.0.0 - - [01/Oct/2010:00:00:03 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
11
+ 0.0.0.0 - - [02/Oct/2010:00:00:03 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
12
+
13
+ * then you get 2 SQL rows that look like:
14
+ 2010-10-01, my-first-post, 3
15
+ 2010-10-02, my-first-post, 1
16
+ * note: the date column truncates the timestamp to the day, so days are in whatever timezone your log file reports in
17
+
18
+
19
+ == WHY?
20
+ * so you can query a database by date and path and get pageviews and have that data stored CHEAP
21
+ * because I couldn't find anything simpler, and Google Analytics is limited to 10,000 API requests per day
22
+
23
+
24
+ == USAGE
25
+ sj = SkinnyJeans::execute(logfile_path = "access.log", sqlite_skinny_jeans = "sqlite_skinny_jeans.db", path_regexp = /\s\/posts\/(.*)\sHTTP/, date_regexp = /\[(\d.*\d)\]/)
26
+ sj.pageview.where("date = '2010-10-01' and path = 'my-first-post'")
27
+ => #<SkinnyJeans::Pageview id: 1, date: "2010-10-01", path: "my-first-post", pageview_count: 3>
28
+ 1. NOTE: for now *you have to monkey patch the SkinnyJeans#parse_string_as_date*
29
+ 2. Parse oldest logs first, then run regularly against your main log, let logrotate handle the rest (skinny_jeans remembers where it left off)
30
+ 3. ASSUMES reading log files in ascending order, keeps track of last line read so you could put it on a scheduler or cron job
31
+ 4. access the 2 ActiveRecord classes: sj.pageview (returns the Pageview class) and sj.update (returns the Update class)
32
+ 5. enjoy the skinny jeans
33
+
34
+
35
+ == PERFORMANCE
36
+ * it parses 100,000 lines in < 2.5 seconds
37
+ * persists 1,000 requests with 2 compound indexes in 15 seconds, or 10 seconds with the home_run C extension
data/Rakefile ADDED
@@ -0,0 +1,15 @@
1
+ require 'rake'
2
# Registers the gem packaging tasks through Jeweler. When Jeweler cannot be
# loaded, the Rakefile still loads and prints installation instructions.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |gem|
    gem.name     = "skinny_jeans"
    gem.summary  = "Fast webserver log parser for persisting daily pageviews per path to sqlite"
    gem.email    = "jonathan.otto@gmail.com"
    gem.homepage = "http://github.com/jotto/skinny_jeans"
    gem.authors  = ["Jonathan Otto"]

    # runtime dependencies as [name, version requirement] pairs
    [
      ['sqlite3-ruby', '>= 1.2.4'],
      ['activerecord', '>= 2.3.8']
    ].each do |dependency_name, requirement|
      gem.add_dependency dependency_name, requirement
    end
  end
rescue LoadError
  puts "Jeweler, or one of its dependencies, is not available. Install it with: sudo gem install jeweler"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.2
@@ -0,0 +1,157 @@
1
+ require 'time'
2
+ require 'benchmark'
3
+ require 'rubygems'
4
+ require 'sqlite3'
5
+ require 'active_record'
6
+ require 'zlib'
7
+ # require 'home_run'
8
+
9
# Parses a web server access log and stores daily pageview counts per path in
# a SQLite database through ActiveRecord.
#
# For each log line, the first capture group of +path_regexp+ supplies the
# path and the first capture group of +date_regexp+ supplies the timestamp
# string. Counts are accumulated in memory (see #hash_of_dates) and then
# merged into the +pageviews+ table. Each run appends a row to the +updates+
# table recording where it stopped, so the next run can resume from there.
class SkinnyJeans

  # Builds an instance and runs #execute on it. Returns the instance.
  def self.execute(logfile_path, sqlite_db_path, path_regexp, date_regexp)
    new(logfile_path, sqlite_db_path, path_regexp, date_regexp).execute
  end

  # hash_of_dates:    { "YYYY-MM-DD" => { path => count } } gathered by #execute
  # last_pageview_at: Time of the most recent line counted by this instance
  attr_accessor :hash_of_dates, :last_pageview_at

  # logfile_path   - log file to read; a name ending in ".gz" is read through Zlib
  # sqlite_db_path - SQLite database file (created when missing)
  # path_regexp    - Regexp whose first capture group is the path to count
  # date_regexp    - Regexp whose first capture group is the timestamp string
  def initialize(logfile_path, sqlite_db_path, path_regexp, date_regexp)
    @logfile_path   = logfile_path
    @sqlite_db_path = sqlite_db_path
    @path_regexp    = path_regexp
    @date_regexp    = date_regexp
    # only a trailing ".gz" marks a gzipped log; "gz" elsewhere in the path
    # (e.g. a directory name) must not switch readers
    @is_gzipped = !logfile_path.to_s[/\.gz\z/i].nil?
    prepare_db
    @hash_of_dates = {}
    @last_pageview_at = nil
  end

  # Connects ActiveRecord to the SQLite file and creates the +pageviews+ and
  # +updates+ tables (plus the pageviews indexes) when they do not exist yet.
  def prepare_db
    # create database if necessary
    SQLite3::Database.new(@sqlite_db_path)
    ActiveRecord::Base.establish_connection(:adapter => 'sqlite3', :database => @sqlite_db_path)
    # create tables if necessary
    if !Pageview.table_exists?
      ActiveRecord::Base.connection.create_table(:pageviews) do |t|
        t.column :date, :date
        t.column :path, :string
        t.column :pageview_count, :integer
      end
      # flow tight like skinny jeans with these compound indexes
      ActiveRecord::Base.connection.add_index(:pageviews, [:date, :path], :name => "date_path_index")
      ActiveRecord::Base.connection.add_index(:pageviews, [:date, :pageview_count], :name => "date_pageview_count_index")
    end
    if !Update.table_exists?
      ActiveRecord::Base.connection.create_table(:updates) do |t|
        t.column :last_pageview_at, :timestamp
        t.column :lines_parsed, :integer
        t.column :last_line_parsed, :string
      end
    end
  end

  # Reads the log, counts pageviews per day and path, merges the counts into
  # the database and records a checkpoint row. Returns self.
  #
  # Raises ArgumentError (from parse_string_as_date) when a captured timestamp
  # cannot be parsed.
  def execute
    lines_parsed = 0
    last_line_parsed = last_pageview_at = lineno_of_last_line_parsed = nil
    # last_update = Update.order("id DESC").limit(1).first
    last_update = Update.find(:first, :order => "id DESC", :limit => 1)

    # see if the last_line_parsed exists in the current log file
    # if it doesn't exist, we fall back to comparing timestamps against last_pageview_at
    if last_update
      last_pageview_at = last_update.last_pageview_at
      last_line_parsed = last_update.last_line_parsed
      file_reader do |line, lineno|
        if line == last_line_parsed
          lineno_of_last_line_parsed = lineno
          break
        end
      end
      puts "last line parsed was\n#{last_line_parsed}\nat lineno #{lineno_of_last_line_parsed}"
    end

    realtime = Benchmark.realtime do
      file_reader do |line, lineno|
        # the checkpoint line itself was already counted by the previous run,
        # so skip up to AND including it
        next if lineno_of_last_line_parsed && lineno <= lineno_of_last_line_parsed

        path_match = line[@path_regexp, 1]
        next if path_match.nil?
        date_match = line[@date_regexp, 1]
        next if date_match.nil?
        time_object = parse_string_as_date(date_match)

        # checkpoint line not found (e.g. the log was rotated): skip lines older
        # than the last counted pageview; lines with an equal timestamp are kept
        next if lineno_of_last_line_parsed.nil? && !last_pageview_at.nil? && time_object < last_pageview_at

        insert_or_increment([time_object, path_match])
        last_line_parsed = line
        lines_parsed += 1
      end
    end

    puts "completed parsing in #{realtime}"

    persisted = 0
    # counts and checkpoint are written in one transaction so that a failure
    # cannot leave counts persisted without the checkpoint that prevents them
    # from being added again on the next run
    ActiveRecord::Base.transaction do
      realtime = Benchmark.realtime do
        hash_of_dates.each do |date, hash_of_paths|
          hash_of_paths.each do |path, count|
            pv = Pageview.find_or_create_by_date_and_path(date, path)
            pv.pageview_count ||= 0
            pv.pageview_count += count
            pv.save!
            persisted += 1
          end
        end
      end

      puts "completed persistence in #{realtime}"

      # when this run counted nothing new, keep the previous checkpoint time
      # instead of overwriting it with nil
      Update.create!({:last_pageview_at => (self.last_pageview_at || last_pageview_at), :lines_parsed => lines_parsed, :last_line_parsed => last_line_parsed})
    end

    puts "total records in DB: #{Pageview.count}\nlines parsed this round: #{lines_parsed}\nlines persisted this round:#{persisted}\ntotal SkinnyJeans executions since inception: #{Update.count}"

    self
  end

  # Yields [line, lineno] for every line of the log file, with lineno starting
  # at 0. Gzipped logs are decompressed on the fly. The file handle is closed
  # when iteration finishes or the caller breaks out of the block.
  def file_reader(&block)
    if @is_gzipped
      Zlib::GzipReader.open(@logfile_path) { |gz| each_line_with_lineno(gz, &block) }
    else
      File.open(@logfile_path, "r") { |file| each_line_with_lineno(file, &block) }
    end
  end

  # Accessors for the two ActiveRecord model classes.
  def pageview; get_ar_class(Pageview); end
  def update; get_ar_class(Update); end

  # Returns +klass+, re-running prepare_db first when ActiveRecord reports
  # that no connection has been established.
  def get_ar_class(klass)
    klass.connection # raises ConnectionNotEstablished when there is no connection
    klass
  rescue ActiveRecord::ConnectionNotEstablished
    prepare_db
    klass
  end

  private

  # Yields [line, lineno] (0-based) for each line read from +io+.
  def each_line_with_lineno(io)
    lineno = 0
    io.each_line do |line|
      yield([line, lineno])
      lineno += 1
    end
  end

  # Converts a timestamp such as "02/Oct/2010:11:17:44 -0700" into a ruby Time
  # object. Both negative and positive zone offsets are accepted.
  # Raises ArgumentError when the string does not have that shape.
  def parse_string_as_date(date_string = "02/Oct/2010:11:17:44 -0700")
    day, month, year, hour, minute, seconds, zone = date_string.scan(/(\d{1,2})\/(\w{3,5})\/(\d{4}):(\d\d):(\d\d):(\d\d)\s([+-]?\d{3,4})/).flatten
    raise ArgumentError, "unrecognized log timestamp: #{date_string.inspect}" if day.nil?
    Time.parse("#{year}-#{month}-#{day} #{hour}:#{minute}:#{seconds} #{zone}")
  end

  # Adds one pageview for the [Time, path] pair to hash_of_dates, keyed by the
  # "YYYY-MM-DD" day of the Time, and remembers the Time as last_pageview_at.
  def insert_or_increment(date_path_pair)
    datetime, path = date_path_pair
    date = datetime.strftime("%Y-%m-%d")
    hash_of_dates[date] ||= {}
    hash_of_dates[date][path] ||= 0
    hash_of_dates[date][path] += 1
    @last_pageview_at = datetime
  end

  # One row per (date, path) with its accumulated pageview_count.
  class Pageview < ActiveRecord::Base
  end

  # One row per #execute run: the checkpoint used to resume the next run.
  class Update < ActiveRecord::Base
  end

end
156
+
157
+ # SkinnyJeans::execute(ARGV.first) if "#{$0}".gsub(/.*\//,"") == "skinny_jeans.rb"
@@ -0,0 +1,47 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Gem specification for skinny_jeans. The spec object is the value of this
# expression, which is what RubyGems reads when it loads the file.
Gem::Specification.new do |s|
  s.name = %q{skinny_jeans}
  s.version = "0.2.2"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Jonathan Otto"]
  s.date = %q{2010-10-04}
  s.email = %q{jonathan.otto@gmail.com}
  s.extra_rdoc_files = [
    "README.rdoc"
  ]
  s.files = [
    ".gitignore",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "lib/skinny_jeans.rb",
    "skinny_jeans.gemspec"
  ]
  s.homepage = %q{http://github.com/jotto/skinny_jeans}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.7}
  s.summary = %q{Fast webserver log parser for persisting daily pageviews per path to sqlite}

  # The same two dependencies are declared on every branch; the branches only
  # choose between add_runtime_dependency and add_dependency depending on
  # what the running RubyGems supports.
  if s.respond_to? :specification_version then
    s.specification_version = 3

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<sqlite3-ruby>, [">= 1.2.4"])
      s.add_runtime_dependency(%q<activerecord>, [">= 2.3.8"])
    else
      s.add_dependency(%q<sqlite3-ruby>, [">= 1.2.4"])
      s.add_dependency(%q<activerecord>, [">= 2.3.8"])
    end
  else
    s.add_dependency(%q<sqlite3-ruby>, [">= 1.2.4"])
    s.add_dependency(%q<activerecord>, [">= 2.3.8"])
  end
end
47
+
metadata ADDED
@@ -0,0 +1,103 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: skinny_jeans
3
+ version: !ruby/object:Gem::Version
4
+ hash: 19
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 2
9
+ - 2
10
+ version: 0.2.2
11
+ platform: ruby
12
+ authors:
13
+ - Jonathan Otto
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-10-04 00:00:00 -05:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: sqlite3-ruby
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 23
30
+ segments:
31
+ - 1
32
+ - 2
33
+ - 4
34
+ version: 1.2.4
35
+ type: :runtime
36
+ version_requirements: *id001
37
+ - !ruby/object:Gem::Dependency
38
+ name: activerecord
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ hash: 19
46
+ segments:
47
+ - 2
48
+ - 3
49
+ - 8
50
+ version: 2.3.8
51
+ type: :runtime
52
+ version_requirements: *id002
53
+ description:
54
+ email: jonathan.otto@gmail.com
55
+ executables: []
56
+
57
+ extensions: []
58
+
59
+ extra_rdoc_files:
60
+ - README.rdoc
61
+ files:
62
+ - .gitignore
63
+ - README.rdoc
64
+ - Rakefile
65
+ - VERSION
66
+ - lib/skinny_jeans.rb
67
+ - skinny_jeans.gemspec
68
+ has_rdoc: true
69
+ homepage: http://github.com/jotto/skinny_jeans
70
+ licenses: []
71
+
72
+ post_install_message:
73
+ rdoc_options:
74
+ - --charset=UTF-8
75
+ require_paths:
76
+ - lib
77
+ required_ruby_version: !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ hash: 3
83
+ segments:
84
+ - 0
85
+ version: "0"
86
+ required_rubygems_version: !ruby/object:Gem::Requirement
87
+ none: false
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ hash: 3
92
+ segments:
93
+ - 0
94
+ version: "0"
95
+ requirements: []
96
+
97
+ rubyforge_project:
98
+ rubygems_version: 1.3.7
99
+ signing_key:
100
+ specification_version: 3
101
+ summary: Fast webserver log parser for persisting daily pageviews per path to sqlite
102
+ test_files: []
103
+