skinny_jeans 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.db
2
+ *.gz
3
+ *.log
4
+ pkg
5
+ .DS*
data/README.rdoc ADDED
@@ -0,0 +1,37 @@
1
+ = SKINNY JEANS LOG PARSING WITH RUBY & SQLITE FOR HIPSTERS
2
+ http://img696.imageshack.us/img696/75/skinnys3.jpg
3
+
4
+ == EXAMPLE
5
+
6
+ * your log file has lines that look like
7
+
8
+ 0.0.0.0 - - [01/Oct/2010:00:00:00 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
9
+ 0.0.0.0 - - [01/Oct/2010:00:00:01 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
10
+ 0.0.0.0 - - [01/Oct/2010:00:00:03 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
11
+ 0.0.0.0 - - [02/Oct/2010:00:00:03 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
12
+
13
+ * then you get 2 SQL rows that look like:
14
+ 2010-10-01, my-first-post, 3
15
+ 2010-10-02, my-first-post, 1
16
+ * note that the date column truncates the timestamp, so the days are in whatever timezone your log file reports in
17
+
18
+
19
+ == WHY?
20
+ * so you can query a database by date and path and get pageviews and have that data stored CHEAP
21
+ * because I couldn't find anything simpler and Google Analytics is limited to 10,000 API requests per day
22
+
23
+
24
+ == USAGE
25
+ sj = SkinnyJeans::execute(logfile_path = "access.log", sqlite_skinny_jeans = "sqlite_skinny_jeans.db", path_regexp = /\s\/posts\/(.*)\sHTTP/, date_regexp = /\[(\d.*\d)\]/)
26
+ sj.pageview.where("date = '2010-10-01' and path = 'my-first-post'")
27
+ => #<SkinnyJeans::Pageview id: 1, date: "2010-10-01", path: "my-first-post", pageview_count: 3>
28
+ 1. NOTE: for now *you have to monkey patch SkinnyJeans#parse_string_as_date*
29
+ 2. Parse oldest logs first, then run regularly against your main log, let logrotate handle the rest (skinny_jeans remembers where it left off)
30
+ 3. ASSUMES reading log files in ascending order, keeps track of last line read so you could put it on a scheduler or cron job
31
+ 4. access the 2 activerecord classes, sj.pageview (returns Pageview class), and sj.update
32
+ 5. enjoy the skinny jeans
33
+
34
+
35
+ == PERFORMANCE
36
+ * it parses 100,000 lines in < 2.5 seconds
37
+ * persists 1,000 requests with 2 compound indexes in 15 seconds, or 10 seconds with home_run c extension
data/Rakefile ADDED
@@ -0,0 +1,15 @@
1
require 'rake'

# Gem packaging tasks are provided by Jeweler. Jeweler is optional: when it
# (or one of its dependencies) cannot be loaded, only an install hint is
# printed and no packaging tasks are defined.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |gem|
    # Identity and description of the gem.
    gem.name     = "skinny_jeans"
    gem.summary  = "Fast webserver log parser for persisting daily pageviews per path to sqlite"
    gem.email    = "jonathan.otto@gmail.com"
    gem.homepage = "http://github.com/jotto/skinny_jeans"
    gem.authors  = ["Jonathan Otto"]

    # Runtime dependencies.
    gem.add_dependency 'sqlite3-ruby', '>= 1.2.4'
    gem.add_dependency 'activerecord', '>= 2.3.8'
  end
rescue LoadError
  puts "Jeweler, or one of its dependencies, is not available. Install it with: sudo gem install jeweler"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.2
@@ -0,0 +1,157 @@
1
+ require 'time'
2
+ require 'benchmark'
3
+ require 'rubygems'
4
+ require 'sqlite3'
5
+ require 'active_record'
6
+ require 'zlib'
7
+ # require 'home_run'
8
+
9
# Parses a webserver access log and persists per-day, per-path pageview
# counts into a SQLite database through ActiveRecord.
#
# Each run records a checkpoint (an Update row holding the last counted line
# and its timestamp) so that a later run against the same or a rotated log
# only counts lines that come after it.
class SkinnyJeans

  # Timestamp layout understood by #parse_string_as_date, e.g.
  # "02/Oct/2010:11:17:44 -0700". The UTC offset may carry either sign.
  TIMESTAMP_PATTERN = /(\d{1,2})\/(\w{3,5})\/(\d{4}):(\d\d):(\d\d):(\d\d)\s([+-]?\d{3,4})/

  # Convenience entry point: builds an instance and runs #execute on it.
  #
  # logfile_path   - path of the access log (plain text, or gzip when the
  #                  name ends in ".gz")
  # sqlite_db_path - path of the SQLite database file
  # path_regexp    - Regexp whose first capture group is the path to count
  # date_regexp    - Regexp whose first capture group is the timestamp text
  #
  # Returns the SkinnyJeans instance.
  def self.execute(logfile_path, sqlite_db_path, path_regexp, date_regexp)
    new(logfile_path, sqlite_db_path, path_regexp, date_regexp).execute
  end

  # hash_of_dates    - { "YYYY-MM-DD" => { path => count } } gathered by the
  #                    most recent #execute
  # last_pageview_at - Time of the last line counted by the most recent
  #                    #execute (nil when nothing was counted)
  attr_accessor :hash_of_dates, :last_pageview_at

  # Stores the configuration and makes sure the database schema exists.
  # Parameters are the same as for SkinnyJeans.execute.
  def initialize(logfile_path, sqlite_db_path, path_regexp, date_regexp)
    @logfile_path   = logfile_path
    @sqlite_db_path = sqlite_db_path
    @path_regexp    = path_regexp
    @date_regexp    = date_regexp
    # Decide by file extension; matching "gz" anywhere in the path would
    # misclassify plain logs living in e.g. a "gzipped-archive/" directory.
    @is_gzipped = !(logfile_path.to_s =~ /\.gz\z/i).nil?
    @hash_of_dates    = {}
    @last_pageview_at = nil
    prepare_db
  end

  # Creates the SQLite file, connects ActiveRecord to it and creates the
  # pageviews / updates tables (with their indexes) when they are missing.
  def prepare_db
    # create database if necessary; close the handle right away, it is only
    # opened to make sure the file exists
    SQLite3::Database.new(@sqlite_db_path).close
    ActiveRecord::Base.establish_connection(:adapter => 'sqlite3', :database => @sqlite_db_path)
    connection = ActiveRecord::Base.connection

    # create tables if necessary
    unless Pageview.table_exists?
      connection.create_table(:pageviews) do |t|
        t.column :date, :date
        t.column :path, :string
        t.column :pageview_count, :integer
      end
      # flow tight like skinny jeans with these compound indexes
      connection.add_index(:pageviews, [:date, :path], :name => "date_path_index")
      connection.add_index(:pageviews, [:date, :pageview_count], :name => "date_pageview_count_index")
    end

    unless Update.table_exists?
      connection.create_table(:updates) do |t|
        t.column :last_pageview_at, :timestamp
        t.column :lines_parsed, :integer
        t.column :last_line_parsed, :string
      end
    end
  end

  # Reads the log, counts pageviews per day and path for every line after
  # the previous checkpoint, adds the counts to the pageviews table and
  # stores a new checkpoint. Progress is reported on stdout.
  #
  # Raises ArgumentError when a captured timestamp cannot be parsed.
  # Returns self.
  def execute
    # Start from a clean slate so that calling #execute twice on the same
    # instance does not persist the first run's counts a second time.
    @hash_of_dates    = {}
    @last_pageview_at = nil

    lines_parsed      = 0
    checkpoint_line   = nil
    checkpoint_time   = nil
    checkpoint_lineno = nil
    # last_update = Update.order("id DESC").limit(1).first
    last_update = Update.find(:first, :order => "id DESC", :limit => 1)

    # see if the last parsed line exists in the current log file
    # if it doesn't exist, we'll simply read anything with a timestamp
    # that is not older than the checkpoint time
    if last_update
      checkpoint_time = last_update.last_pageview_at
      checkpoint_line = last_update.last_line_parsed
      file_reader do |line, lineno|
        if line == checkpoint_line
          checkpoint_lineno = lineno
          break
        end
      end
      puts "last line parsed was\n#{checkpoint_line}\nat lineno #{checkpoint_lineno}"
    end

    realtime = Benchmark.realtime do
      file_reader do |line, lineno|
        # The checkpoint line itself was counted by the previous run, so
        # resume strictly after it.
        next if checkpoint_lineno && lineno <= checkpoint_lineno

        path_match = line[@path_regexp, 1]
        next if path_match.nil?
        date_match = line[@date_regexp, 1]
        next if date_match.nil?
        time_object = parse_string_as_date(date_match)

        # Checkpoint line not present in this file (e.g. the log was
        # rotated): fall back to comparing timestamps.
        next if checkpoint_lineno.nil? && !checkpoint_time.nil? && time_object < checkpoint_time

        insert_or_increment([time_object, path_match])
        checkpoint_line = line
        lines_parsed += 1
      end
    end

    puts "completed parsing in #{realtime}"

    persisted = 0
    # One transaction for the counts and the checkpoint: either both are
    # stored or neither is, and SQLite commits once instead of per row.
    Pageview.transaction do
      realtime = Benchmark.realtime do
        hash_of_dates.each do |date, hash_of_paths|
          hash_of_paths.each do |path, count|
            pv = Pageview.find_or_create_by_date_and_path(date, path)
            pv.pageview_count = (pv.pageview_count || 0) + count
            pv.save!
            persisted += 1
          end
        end
      end

      puts "completed persistence in #{realtime}"

      # Keep the previous checkpoint time when this run counted nothing;
      # otherwise the next run would have no timestamp to fall back on.
      Update.create!({:last_pageview_at => (self.last_pageview_at || checkpoint_time), :lines_parsed => lines_parsed, :last_line_parsed => checkpoint_line})
    end

    puts "total records in DB: #{Pageview.count}\nlines parsed this round: #{lines_parsed}\nlines persisted this round:#{persisted}\ntotal SkinnyJeans executions since inception: #{Update.count}"

    self
  end

  # Yields [line, lineno] (lineno is 0-based) for every line of the log,
  # transparently decompressing gzip files. The underlying file is closed
  # when iteration ends, including when the caller breaks out early.
  def file_reader
    each_pair = lambda do |io|
      io.each_with_index { |line, lineno| yield([line, lineno]) }
    end

    if @is_gzipped
      Zlib::GzipReader.open(@logfile_path) { |gz| each_pair.call(gz) }
    else
      File.open(@logfile_path, "r") { |file| each_pair.call(file) }
    end
  end

  # Returns the Pageview ActiveRecord class.
  def pageview
    get_ar_class(Pageview)
  end

  # Returns the Update ActiveRecord class.
  def update
    get_ar_class(Update)
  end

  # Returns klass. Should ActiveRecord report a missing connection, the
  # database is prepared again and klass is still returned.
  def get_ar_class(klass)
    klass
  rescue ActiveRecord::ConnectionNotEstablished
    prepare_db
    klass
  end

  private

  # Converts a log timestamp such as "02/Oct/2010:11:17:44 -0700" into a
  # ruby Time object carrying the offset given in the string.
  #
  # Raises ArgumentError when date_string does not match TIMESTAMP_PATTERN.
  def parse_string_as_date(date_string = "02/Oct/2010:11:17:44 -0700")
    parts = TIMESTAMP_PATTERN.match(date_string.to_s)
    raise ArgumentError, "unrecognized log timestamp: #{date_string.inspect}" if parts.nil?

    day, month, year, hour, minute, seconds, zone = parts.captures
    Time.parse("#{year}-#{month}-#{day} #{hour}:#{minute}:#{seconds} #{zone}")
  end

  # Adds one pageview for the [Time, path] pair to hash_of_dates, keyed by
  # the day of the timestamp as formatted in its own offset, and remembers
  # the timestamp as the last pageview seen.
  def insert_or_increment(date_path_pair)
    datetime, path = date_path_pair
    date = datetime.strftime("%Y-%m-%d")
    counts_for_day = (hash_of_dates[date] ||= {})
    counts_for_day[path] = (counts_for_day[path] || 0) + 1
    @last_pageview_at = datetime
  end

  # One row per (date, path) holding the accumulated pageview_count.
  class Pageview < ActiveRecord::Base
  end

  # One row per #execute run; the newest row is the checkpoint.
  class Update < ActiveRecord::Base
  end

end
156
+
157
+ # SkinnyJeans::execute(ARGV.first) if "#{$0}".gsub(/.*\//,"") == "skinny_jeans.rb"
@@ -0,0 +1,47 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
Gem::Specification.new do |spec|
  # Identity
  spec.name    = %q{skinny_jeans}
  spec.version = "0.2.2"

  spec.required_rubygems_version = Gem::Requirement.new(">= 0") if spec.respond_to?(:required_rubygems_version=)
  spec.authors = ["Jonathan Otto"]
  spec.date = %q{2010-10-04}
  spec.email = %q{jonathan.otto@gmail.com}

  # Packaged files
  spec.extra_rdoc_files = ["README.rdoc"]
  spec.files = [
    ".gitignore",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "lib/skinny_jeans.rb",
    "skinny_jeans.gemspec"
  ]

  spec.homepage = %q{http://github.com/jotto/skinny_jeans}
  spec.rdoc_options = ["--charset=UTF-8"]
  spec.require_paths = ["lib"]
  spec.rubygems_version = %q{1.3.7}
  spec.summary = %q{Fast webserver log parser for persisting daily pageviews per path to sqlite}

  # Dependencies are declared as runtime dependencies when the running
  # RubyGems supports specification versions and is at least 1.2.0, and
  # through the generic add_dependency call otherwise.
  declare_as_runtime = false
  if spec.respond_to?(:specification_version)
    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
    spec.specification_version = 3
    declare_as_runtime = Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
  end

  [
    [%q<sqlite3-ruby>, [">= 1.2.4"]],
    [%q<activerecord>, [">= 2.3.8"]]
  ].each do |gem_name, requirements|
    if declare_as_runtime
      spec.add_runtime_dependency(gem_name, requirements)
    else
      spec.add_dependency(gem_name, requirements)
    end
  end
end
47
+
metadata ADDED
@@ -0,0 +1,103 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: skinny_jeans
3
+ version: !ruby/object:Gem::Version
4
+ hash: 19
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 2
9
+ - 2
10
+ version: 0.2.2
11
+ platform: ruby
12
+ authors:
13
+ - Jonathan Otto
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-10-04 00:00:00 -05:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: sqlite3-ruby
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 23
30
+ segments:
31
+ - 1
32
+ - 2
33
+ - 4
34
+ version: 1.2.4
35
+ type: :runtime
36
+ version_requirements: *id001
37
+ - !ruby/object:Gem::Dependency
38
+ name: activerecord
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ hash: 19
46
+ segments:
47
+ - 2
48
+ - 3
49
+ - 8
50
+ version: 2.3.8
51
+ type: :runtime
52
+ version_requirements: *id002
53
+ description:
54
+ email: jonathan.otto@gmail.com
55
+ executables: []
56
+
57
+ extensions: []
58
+
59
+ extra_rdoc_files:
60
+ - README.rdoc
61
+ files:
62
+ - .gitignore
63
+ - README.rdoc
64
+ - Rakefile
65
+ - VERSION
66
+ - lib/skinny_jeans.rb
67
+ - skinny_jeans.gemspec
68
+ has_rdoc: true
69
+ homepage: http://github.com/jotto/skinny_jeans
70
+ licenses: []
71
+
72
+ post_install_message:
73
+ rdoc_options:
74
+ - --charset=UTF-8
75
+ require_paths:
76
+ - lib
77
+ required_ruby_version: !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ hash: 3
83
+ segments:
84
+ - 0
85
+ version: "0"
86
+ required_rubygems_version: !ruby/object:Gem::Requirement
87
+ none: false
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ hash: 3
92
+ segments:
93
+ - 0
94
+ version: "0"
95
+ requirements: []
96
+
97
+ rubyforge_project:
98
+ rubygems_version: 1.3.7
99
+ signing_key:
100
+ specification_version: 3
101
+ summary: Fast webserver log parser for persisting daily pageviews per path to sqlite
102
+ test_files: []
103
+