skinny-jeans 0.1.0 → 0.2.0

data/README.rdoc CHANGED
@@ -1,7 +1,7 @@
  = SKINNY JEANS LOG PARSING WITH RUBY & SQLITE FOR HIPSTERS
  http://img696.imageshack.us/img696/75/skinnys3.jpg
 
- == example
+ == EXAMPLE
 
  * your log file has lines that look like
 
@@ -10,9 +10,10 @@ http://img696.imageshack.us/img696/75/skinnys3.jpg
  0.0.0.0 - - [01/Oct/2010:00:00:03 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
  0.0.0.0 - - [02/Oct/2010:00:00:03 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
 
- * then you get 1 SQL row that looks like:
+ * then you get 2 SQL rows that looks like:
  2010-10-01, my-first-post, 3
  2010-10-02, my-first-post, 1
+ * note the date columns truncate timestamp, so the days are in whatever timezone your log file reports in
 
 
  == WHY?
@@ -20,18 +21,17 @@ http://img696.imageshack.us/img696/75/skinnys3.jpg
  * because i couldn't find anything simpler and Google Analytics is limited to 10,000 API requests per day
 
 
- == Usage
- SkinnyJeans::execute(logfile_path = "access.log", sqlite_skinny_jeans = "sqlite_skinny_jeans.db", path_regexp = /\s\/posts\/(.*)\sHTTP/, date_regexp = /\[(\d.*\d)\]/)
- 1. Parse oldest logs first, then run regularly against your main log, let logrotate handle the rest (skinny_jeans remembers where it left off)
- 2. enjoy the skinny jeans
+ == USAGE
+ sj = SkinnyJeans::execute(logfile_path = "access.log", sqlite_skinny_jeans = "sqlite_skinny_jeans.db", path_regexp = /\s\/posts\/(.*)\sHTTP/, date_regexp = /\[(\d.*\d)\]/)
+ sj.pageview.where("date = '2010-10-01' and path = 'my-first-post'")
+ => #<SkinnyJeans::Pageview id: 1, date: "2010-10-01", path: "my-first-post", pageview_count: 3>
+ 1. NOTE: for now *you have to monkey patch the SkinnyJeans#parse_string_as_date*
+ 2. Parse oldest logs first, then run regularly against your main log, let logrotate handle the rest (skinny_jeans remembers where it left off)
+ 3. ASSUMES reading log files in ascending order, keeps track of last line read so you could put it on a scheduler or cron job
+ 4. access the 2 activerecord classes, sj.pageview (returns Pageview class), and sj.update
+ 5. enjoy the skinny jeans
 
 
- == WHAT IT DO
+ == PERFORMANCE
  * it parses 100,000 lines in < 2.5 seconds
- * persists 1,000 requests with 2 compound indexes in 10 seconds (using home_run c date parser, 15 seconds without home_run)
-
- * parse a webserver's log file to aggregate paths by DAY with pageview counts
-
- * creates sqlite database with columns: date, path, pageview_count
-
- * ASSUMES reading log files in ascending order, keeps track of last line read so you could put it on a scheduler or cron job
+ * persists 1,000 requests with 2 compound indexes in 15 seconds, or 10 seconds with home_run c extension
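
For readers following the new USAGE section above, here is a slightly fuller invocation sketch; the file names and regexps are illustrative assumptions, not part of the gem, and per the README note SkinnyJeans#parse_string_as_date may still need monkey patching for other timestamp formats:

  require 'rubygems'
  require 'skinny_jeans'

  # hypothetical log path, db path and regexps; adapt to your own log format
  sj = SkinnyJeans.execute(
    "access.log",               # parse the oldest log first, the live log on later runs
    "sqlite_skinny_jeans.db",   # sqlite file, created on demand
    /\s\/posts\/(.*)\sHTTP/,    # captures the path segment to aggregate on
    /\[(\d.*\d)\]/              # captures the bracketed timestamp
  )

  # per-day counts come back through the ActiveRecord accessor
  sj.pageview.where("date = '2010-10-01' and path = 'my-first-post'").each do |pv|
    puts [pv.date, pv.path, pv.pageview_count].join(", ")
  end
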
data/VERSION CHANGED
@@ -1 +1 @@
- 0.1.0
+ 0.2.0
data/lib/skinny_jeans.rb CHANGED
@@ -3,115 +3,137 @@ require 'benchmark'
  require 'rubygems'
  require 'sqlite3'
  require 'active_record'
+ require 'zlib'
  # require 'home_run'
 
- class Pageview < ActiveRecord::Base
- end
- class Update < ActiveRecord::Base
- end
-
  class SkinnyJeans
- class << self
-
- def prepare_db(sqlite_db_path)
- # create database if necessary
- SQLite3::Database.new(sqlite_db_path)
- ActiveRecord::Base.establish_connection(:adapter => 'sqlite3', :database => sqlite_db_path)
- # create tables if necessary
- if !Pageview.table_exists?
- ActiveRecord::Base.connection.create_table(:pageviews) do |t|
- t.column :date, :date
- t.column :path, :string
- t.column :pageview_count, :integer
- end
- # flow tight like skinny jeans with these compound indexes
- ActiveRecord::Base.connection.add_index(:pageviews, [:date, :path], :name => "date_path_index")
- ActiveRecord::Base.connection.add_index(:pageviews, [:date, :pageview_count], :name => "date_pageview_count_index")
+
+ def self.execute(logfile_path, sqlite_db_path, path_regexp, date_regexp)
+ self.new(logfile_path, sqlite_db_path, path_regexp, date_regexp).execute
+ end
+
+ attr_accessor :hash_of_dates, :last_pageview_at
+
+ def initialize(logfile_path, sqlite_db_path, path_regexp, date_regexp)
+ @logfile_path, @sqlite_db_path, @path_regexp, @date_regexp = [logfile_path, sqlite_db_path, path_regexp, date_regexp]
+ @is_gzipped = !logfile_path.to_s[/gz/].nil?
+ prepare_db
+ @hash_of_dates = {}
+ @last_datetime = nil
+ end
+
+ def prepare_db
+ # create database if necessary
+ SQLite3::Database.new(@sqlite_db_path)
+ ActiveRecord::Base.establish_connection(:adapter => 'sqlite3', :database => @sqlite_db_path)
+ # create tables if necessary
+ if !Pageview.table_exists?
+ ActiveRecord::Base.connection.create_table(:pageviews) do |t|
+ t.column :date, :date
+ t.column :path, :string
+ t.column :pageview_count, :integer
  end
- if !Update.table_exists?
- ActiveRecord::Base.connection.create_table(:updates) do |t|
- t.column :last_pageview_at, :timestamp
- t.column :lines_parsed, :integer
- t.column :last_line_parsed, :string
- end
+ # flow tight like skinny jeans with these compound indexes
+ ActiveRecord::Base.connection.add_index(:pageviews, [:date, :path], :name => "date_path_index")
+ ActiveRecord::Base.connection.add_index(:pageviews, [:date, :pageview_count], :name => "date_pageview_count_index")
+ end
+ if !Update.table_exists?
+ ActiveRecord::Base.connection.create_table(:updates) do |t|
+ t.column :last_pageview_at, :timestamp
+ t.column :lines_parsed, :integer
+ t.column :last_line_parsed, :string
  end
  end
+ end
+
+ def execute
+
+ lines_parsed = 0
+ last_line_parsed, last_pageview_at, lineno_of_last_line_parsed = [nil,nil,nil]
+ last_update = Update.order("id DESC").limit(1).first
 
- def execute(logfile_path, sqlite_db_path, path_regexp, date_regexp)
-
- prepare_db(sqlite_db_path)
- skinny_jean = self.new
- lines_parsed = 0
- last_line_parsed, last_pageview_at, lineno_of_last_line_parsed = [nil,nil,nil]
- last_update = Update.order("id DESC").limit(1).first
-
- if last_update
- last_pageview_at, last_line_parsed = last_update.last_pageview_at, last_update.last_line_parsed
- File.new(logfile_path, "r").each_with_index do |line, lineno|
- if line == last_line_parsed
- lineno_of_last_line_parsed = lineno
- break
- end
+ # see if the last_line_parsed parsed exists in the current log file
+ # if it doesnt exist, we'll simply read anything with a timestamp greater than last_pageview_at
+ if last_update
+ last_pageview_at, last_line_parsed = last_update.last_pageview_at, last_update.last_line_parsed
+ file_reader do |line, lineno|
+ if line == last_line_parsed
+ lineno_of_last_line_parsed = lineno
+ break
  end
- puts "last line parsed was\n#{last_line_parsed}\nat lineno #{lineno_of_last_line_parsed}"
  end
+ puts "last line parsed was\n#{last_line_parsed}\nat lineno #{lineno_of_last_line_parsed}"
+ end
 
- realtime = Benchmark.realtime do
- date_path_pairs_array = []
-
- File.new(logfile_path, "r").each_with_index do |line, lineno|
- next if lineno_of_last_line_parsed && lineno < lineno_of_last_line_parsed
+ realtime = Benchmark.realtime do
+ date_path_pairs_array = []
+ lineno = -1
 
- path_match = line[path_regexp, 1]
- next if path_match.nil?
- date_match = line[date_regexp, 1]
- next if date_match.nil?
- time_object = parse_string_as_date(date_match)
+ file_reader do |line, index|
+ lineno += 1
+ next if lineno_of_last_line_parsed && lineno < lineno_of_last_line_parsed
 
- next if lineno_of_last_line_parsed.nil? && !last_pageview_at.nil? && time_object < last_pageview_at
+ path_match = line[@path_regexp, 1]
+ next if path_match.nil?
+ date_match = line[@date_regexp, 1]
+ next if date_match.nil?
+ time_object = parse_string_as_date(date_match)
 
- skinny_jean.insert_or_increment([time_object,path_match])
- last_line_parsed = line
- lines_parsed += 1
- end
+ next if lineno_of_last_line_parsed.nil? && !last_pageview_at.nil? && time_object < last_pageview_at
 
- skinny_jean.hash_of_dates
+ insert_or_increment([time_object,path_match])
+ last_line_parsed = line
+ lines_parsed += 1
  end
+ end
 
- puts "completed parsing in #{realtime}"
-
- realtime = Benchmark.realtime do
- skinny_jean.hash_of_dates.each do |date, hash_of_paths|
- hash_of_paths.keys.each do |path|
- pv = Pageview.find_or_create_by_date_and_path(date, path)
- pv.pageview_count ||= 0
- pv.pageview_count += hash_of_paths[path]
- pv.save!
- end
+ puts "completed parsing in #{realtime}"
+
+ persisted = 0
+ realtime = Benchmark.realtime do
+ hash_of_dates.each do |date, hash_of_paths|
+ hash_of_paths.keys.each do |path|
+ pv = Pageview.find_or_create_by_date_and_path(date, path)
+ pv.pageview_count ||= 0
+ pv.pageview_count += hash_of_paths[path]
+ pv.save!
+ persisted += 1
  end
  end
-
- puts "completed persistence in #{realtime}"
+ end
+
+ puts "completed persistence in #{realtime}"
 
- Update.create!({:last_pageview_at => skinny_jean.last_pageview_at, :lines_parsed => lines_parsed, :last_line_parsed => last_line_parsed})
+ Update.create!({:last_pageview_at => self.last_pageview_at, :lines_parsed => lines_parsed, :last_line_parsed => last_line_parsed})
 
- puts "total records in DB: #{Pageview.count}\nlines parsed this round: #{lines_parsed}\ntotal SkinnyJeans executions since inception: #{Update.count}"
+ puts "total records in DB: #{Pageview.count}\nlines parsed this round: #{lines_parsed}\nlines persisted this round:#{persisted}\ntotal SkinnyJeans executions since inception: #{Update.count}"
 
- end
+ return self
+
+ end
 
- # return a ruby Time object
- def parse_string_as_date(date_string = "02/Oct/2010:11:17:44 -0700")
- day,month,year,hour,minute,seconds,zone = date_string.scan(/(\d{1,2})\/(\w{3,5})\/(\d{4}):(\d\d):(\d\d):(\d\d)\s(-?\d{3,4})/).flatten
- Time.parse("#{year}-#{month}-#{day} #{hour}:#{minute}:#{seconds} #{zone}")
+ def file_reader
+ if @is_gzipped
+ lineno = 0
+ Zlib::GzipReader.open(@logfile_path){|line|yield([line.read,lineno]);lineno+=1}
+ else
+ File.new(@logfile_path, "r").each_with_index{|line, lineno| yield([line,lineno])}
  end
+ end
+
+ def pageview;get_ar_class(Pageview);end
+ def update;get_ar_class(Update);end
 
+ def get_ar_class(klass)
+ begin;return(klass);rescue(ActiveRecord::ConnectionNotEstablished);prepare_db;end
  end
 
- attr_accessor :hash_of_dates, :last_pageview_at
+ private
 
- def initialize
- @hash_of_dates = {}
- @last_datetime = nil
+ # return a ruby Time object
+ def parse_string_as_date(date_string = "02/Oct/2010:11:17:44 -0700")
+ day,month,year,hour,minute,seconds,zone = date_string.scan(/(\d{1,2})\/(\w{3,5})\/(\d{4}):(\d\d):(\d\d):(\d\d)\s(-?\d{3,4})/).flatten
+ Time.parse("#{year}-#{month}-#{day} #{hour}:#{minute}:#{seconds} #{zone}")
  end
 
  def insert_or_increment(date_path_pair)
@@ -123,6 +145,12 @@ class SkinnyJeans
  @last_pageview_at = datetime
  end
 
+ class Pageview < ActiveRecord::Base
+ end
+ class Update < ActiveRecord::Base
+ end
+
+
  end
 
  # SkinnyJeans::execute(ARGV.first) if "#{$0}".gsub(/.*\//,"") == "skinny_jeans.rb"
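
One notable 0.2.0 change above is the new file_reader method together with require 'zlib': when the logfile_path contains "gz" the file is read through Zlib::GzipReader instead of File, so rotated, compressed logs can be fed in directly. A minimal driver sketch, assuming hypothetical rotated log names and the regexps from the README:

  require 'skinny_jeans'

  # replay the oldest rotated (gzipped) logs first, then the live log;
  # skinny_jeans records the last line/timestamp it saw between runs
  %w[access.log.2.gz access.log.1.gz access.log].each do |path|
    SkinnyJeans.execute(path, "sqlite_skinny_jeans.db",
                        /\s\/posts\/(.*)\sHTTP/, /\[(\d.*\d)\]/)
  end
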
data/skinny-jeans.gemspec CHANGED
@@ -5,11 +5,11 @@
 
  Gem::Specification.new do |s|
  s.name = %q{skinny-jeans}
- s.version = "0.1.0"
+ s.version = "0.2.0"
 
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Jonathan Otto"]
- s.date = %q{2010-10-03}
+ s.date = %q{2010-10-04}
  s.email = %q{jonathan.otto@gmail.com}
  s.extra_rdoc_files = [
  "README.rdoc"
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: skinny-jeans
  version: !ruby/object:Gem::Version
- hash: 27
+ hash: 23
  prerelease: false
  segments:
  - 0
- - 1
+ - 2
  - 0
- version: 0.1.0
+ version: 0.2.0
  platform: ruby
  authors:
  - Jonathan Otto
@@ -15,7 +15,7 @@ autorequire:
  bindir: bin
  cert_chain: []
 
- date: 2010-10-03 00:00:00 -05:00
+ date: 2010-10-04 00:00:00 -05:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency