skinny-jeans 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ *.db
2
+ *.log
3
+ pkg
data/README.rdoc ADDED
@@ -0,0 +1,37 @@
1
+ = SKINNY JEANS LOG PARSING WITH RUBY & SQLITE FOR HIPSTERS
2
+ http://img696.imageshack.us/img696/75/skinnys3.jpg
3
+
4
+ == example
5
+
6
+ * your log file has lines that look like
7
+
8
+ 0.0.0.0 - - [01/Oct/2010:00:00:00 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
9
+ 0.0.0.0 - - [01/Oct/2010:00:00:01 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
10
+ 0.0.0.0 - - [01/Oct/2010:00:00:03 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
11
+ 0.0.0.0 - - [02/Oct/2010:00:00:03 -0700] "GET /posts/my-first-post HTTP/1.1" 200 1337 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
12
+
13
+ * then you get one SQL row per date and path, like:
14
+ 2010-10-01, my-first-post, 3
15
+ 2010-10-02, my-first-post, 1
16
+
17
+
18
+ == WHY?
19
+ * so you can query a database by date and path and get pageviews and have that data stored CHEAP
20
+ * because i couldn't find anything simpler and Google Analytics is limited to 10,000 API requests per day
21
+
22
+
23
+ == Usage
24
+ SkinnyJeans::execute(logfile_path = "access.log", sqlite_skinny_jeans = "sqlite_skinny_jeans.db", path_regexp = /\s\/posts\/(.*)\sHTTP/, date_regexp = /\[(\d.*\d)\]/)
25
+ 1. Parse oldest logs first, then run regularly against your main log, let logrotate handle the rest (skinny_jeans remembers where it left off)
26
+ 2. enjoy the skinny jeans
27
+
28
+
29
+ == WHAT IT DO
30
+ * it parses 100,000 lines in < 2.5 seconds
31
+ * persists 1,000 requests with 2 compound indexes in 10 seconds (using home_run c date parser, 15 seconds without home_run)
32
+
33
+ * parses a webserver's log file to aggregate paths by DAY with pageview counts
34
+
35
+ * creates sqlite database with columns: date, path, pageview_count
36
+
37
+ * ASSUMES reading log files in ascending order, keeps track of last line read so you could put it on a scheduler or cron job
data/Rakefile ADDED
@@ -0,0 +1,15 @@
1
+ require 'rake'
2
# Register the gem packaging tasks when Jeweler can be loaded; otherwise
# print an installation hint instead of failing.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |gem|
    gem.name     = "skinny-jeans"
    gem.summary  = "Fast webserver log parser for persisting daily pageviews per path to sqlite"
    gem.email    = "jonathan.otto@gmail.com"
    gem.homepage = "http://github.com/jotto/skinny-jeans"
    gem.authors  = ["Jonathan Otto"]

    # Runtime dependencies of the library.
    gem.add_dependency 'sqlite3-ruby', '>= 1.2.4'
    gem.add_dependency 'activerecord', '>= 3.0.0'
  end
rescue LoadError
  puts "Jeweler, or one of its dependencies, is not available. Install it with: sudo gem install jeweler"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,128 @@
1
+ require 'time'
2
+ require 'benchmark'
3
+ require 'rubygems'
4
+ require 'sqlite3'
5
+ require 'active_record'
6
+ # require 'home_run'
7
+
8
# ActiveRecord model backing the +pageviews+ table
# (columns: date, path, pageview_count).
class Pageview < ActiveRecord::Base; end

# ActiveRecord model backing the +updates+ table, which records where each
# run stopped (columns: last_pageview_at, lines_parsed, last_line_parsed).
class Update < ActiveRecord::Base; end
12
+
13
# Aggregates webserver access-log lines into per-day, per-path pageview
# counts and persists them to a SQLite database through ActiveRecord.
#
# The class-level methods drive a whole run (SkinnyJeans.execute); an
# instance is only the in-memory accumulator used during that run.
class SkinnyJeans
  # Access-log timestamp, e.g. "02/Oct/2010:11:17:44 -0700". The UTC offset
  # may carry either sign (or none).
  LOG_TIMESTAMP_REGEXP = %r{(\d{1,2})/(\w{3,5})/(\d{4}):(\d\d):(\d\d):(\d\d)\s([+-]?\d{3,4})}

  class << self

    # Connects ActiveRecord to the SQLite file at +sqlite_db_path+ and creates
    # the +pageviews+ and +updates+ tables (with indexes) if they are missing.
    def prepare_db(sqlite_db_path)
      # create database if necessary; close the handle straight away because
      # ActiveRecord opens its own connection below
      SQLite3::Database.new(sqlite_db_path).close
      ActiveRecord::Base.establish_connection(:adapter => 'sqlite3', :database => sqlite_db_path)
      connection = ActiveRecord::Base.connection

      # create tables if necessary
      unless Pageview.table_exists?
        connection.create_table(:pageviews) do |t|
          t.column :date, :date
          t.column :path, :string
          t.column :pageview_count, :integer
        end
        # flow tight like skinny jeans with these compound indexes
        connection.add_index(:pageviews, [:date, :path], :name => "date_path_index")
        connection.add_index(:pageviews, [:date, :pageview_count], :name => "date_pageview_count_index")
      end

      unless Update.table_exists?
        connection.create_table(:updates) do |t|
          t.column :last_pageview_at, :timestamp
          t.column :lines_parsed, :integer
          t.column :last_line_parsed, :string
        end
      end
    end

    # Parses +logfile_path+ and adds the pageviews found to the database at
    # +sqlite_db_path+.
    #
    # logfile_path   - path of the access log to read
    # sqlite_db_path - path of the SQLite database (created when missing)
    # path_regexp    - Regexp whose first capture group is the path to count
    # date_regexp    - Regexp whose first capture group is the log timestamp
    #
    # A run resumes after the line stored by the previous run when that line
    # is still present in the file. Otherwise (e.g. the log was rotated) it
    # skips lines whose timestamp is older than the last stored pageview.
    #
    # Raises ArgumentError when a captured timestamp cannot be parsed.
    def execute(logfile_path, sqlite_db_path, path_regexp, date_regexp)
      prepare_db(sqlite_db_path)
      skinny_jean = new
      lines_parsed = 0
      last_line_parsed = nil
      last_pageview_at = nil
      lineno_of_last_line_parsed = nil

      last_update = Update.order("id DESC").limit(1).first
      if last_update
        last_pageview_at = last_update.last_pageview_at
        last_line_parsed = last_update.last_line_parsed
        lineno_of_last_line_parsed = lineno_of(logfile_path, last_line_parsed)
        puts "last line parsed was\n#{last_line_parsed}\nat lineno #{lineno_of_last_line_parsed}"
      end

      parse_seconds = Benchmark.realtime do
        # block form guarantees the file handle is closed
        File.open(logfile_path, "r") do |logfile|
          logfile.each_with_index do |line, lineno|
            # <= (not <): the stored line itself was already counted last run
            next if lineno_of_last_line_parsed && lineno <= lineno_of_last_line_parsed

            path_match = line[path_regexp, 1]
            next if path_match.nil?
            date_match = line[date_regexp, 1]
            next if date_match.nil?
            time_object = parse_string_as_date(date_match)

            # stored line not found in this file: fall back to timestamps
            next if lineno_of_last_line_parsed.nil? && last_pageview_at && time_object < last_pageview_at

            skinny_jean.insert_or_increment([time_object, path_match])
            last_line_parsed = line
            lines_parsed += 1
          end
        end
      end

      puts "completed parsing in #{parse_seconds}"

      # One transaction for the counts and the bookkeeping row: either both
      # are stored or neither, so an interrupted run cannot be counted twice.
      ActiveRecord::Base.transaction do
        persist_seconds = Benchmark.realtime do
          skinny_jean.hash_of_dates.each do |date, hash_of_paths|
            hash_of_paths.each do |path, count|
              # where/first/new works on every ActiveRecord >= 3.0, unlike
              # the dynamic find_or_create_by_* finders removed in 4.1
              pv = Pageview.where(:date => date, :path => path).first ||
                   Pageview.new(:date => date, :path => path)
              pv.pageview_count = pv.pageview_count.to_i + count
              pv.save!
            end
          end
        end

        puts "completed persistence in #{persist_seconds}"

        Update.create!({
          # keep the previous timestamp when this run found nothing new
          :last_pageview_at => skinny_jean.last_pageview_at || last_pageview_at,
          :lines_parsed     => lines_parsed,
          :last_line_parsed => last_line_parsed
        })
      end

      puts "total records in DB: #{Pageview.count}\nlines parsed this round: #{lines_parsed}\ntotal SkinnyJeans executions since inception: #{Update.count}"
    end

    # Converts an access-log timestamp into a ruby Time object.
    #
    # date_string - e.g. "02/Oct/2010:11:17:44 -0700" (offset sign may be + or -)
    #
    # Raises ArgumentError when +date_string+ is not in that format.
    def parse_string_as_date(date_string = "02/Oct/2010:11:17:44 -0700")
      match = LOG_TIMESTAMP_REGEXP.match(date_string)
      raise ArgumentError, "unrecognized log timestamp: #{date_string.inspect}" if match.nil?

      day, month, year, hour, minute, seconds, zone = match.captures
      Time.parse("#{year}-#{month}-#{day} #{hour}:#{minute}:#{seconds} #{zone}")
    end

    private

    # Returns the zero-based index of the first line in the file equal to
    # +wanted_line+, or nil when no line matches.
    def lineno_of(logfile_path, wanted_line)
      File.open(logfile_path, "r") do |logfile|
        logfile.each_with_index do |line, lineno|
          return lineno if line == wanted_line
        end
      end
      nil
    end

  end

  # hash_of_dates    - { "YYYY-MM-DD" => { path => pageview count } }
  # last_pageview_at - Time of the most recently inserted pageview, or nil
  attr_accessor :hash_of_dates, :last_pageview_at

  def initialize
    @hash_of_dates = {}
    @last_pageview_at = nil
  end

  # Counts one pageview.
  #
  # date_path_pair - two-element Array of [Time, path String]
  #
  # Returns the Time that was passed in.
  def insert_or_increment(date_path_pair)
    datetime, path = date_path_pair
    date = datetime.strftime("%Y-%m-%d")
    hash_of_dates[date] ||= {}
    hash_of_dates[date][path] ||= 0
    hash_of_dates[date][path] += 1
    @last_pageview_at = datetime
  end

end
127
+
128
+ # SkinnyJeans::execute(ARGV.first) if "#{$0}".gsub(/.*\//,"") == "skinny_jeans.rb"
@@ -0,0 +1,47 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{skinny-jeans}
8
+ s.version = "0.1.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Jonathan Otto"]
12
+ s.date = %q{2010-10-03}
13
+ s.email = %q{jonathan.otto@gmail.com}
14
+ s.extra_rdoc_files = [
15
+ "README.rdoc"
16
+ ]
17
+ s.files = [
18
+ ".gitignore",
19
+ "README.rdoc",
20
+ "Rakefile",
21
+ "VERSION",
22
+ "lib/skinny_jeans.rb",
23
+ "skinny-jeans.gemspec"
24
+ ]
25
+ s.homepage = %q{http://github.com/jotto/skinny-jeans}
26
+ s.rdoc_options = ["--charset=UTF-8"]
27
+ s.require_paths = ["lib"]
28
+ s.rubygems_version = %q{1.3.7}
29
+ s.summary = %q{Fast webserver log parser for persisting daily pageviews per path to sqlite}
30
+
31
+ if s.respond_to? :specification_version then
32
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
33
+ s.specification_version = 3
34
+
35
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
36
+ s.add_runtime_dependency(%q<sqlite3-ruby>, [">= 1.2.4"])
37
+ s.add_runtime_dependency(%q<activerecord>, [">= 3.0.0"])
38
+ else
39
+ s.add_dependency(%q<sqlite3-ruby>, [">= 1.2.4"])
40
+ s.add_dependency(%q<activerecord>, [">= 3.0.0"])
41
+ end
42
+ else
43
+ s.add_dependency(%q<sqlite3-ruby>, [">= 1.2.4"])
44
+ s.add_dependency(%q<activerecord>, [">= 3.0.0"])
45
+ end
46
+ end
47
+
metadata ADDED
@@ -0,0 +1,103 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: skinny-jeans
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Jonathan Otto
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-10-03 00:00:00 -05:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: sqlite3-ruby
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 23
30
+ segments:
31
+ - 1
32
+ - 2
33
+ - 4
34
+ version: 1.2.4
35
+ type: :runtime
36
+ version_requirements: *id001
37
+ - !ruby/object:Gem::Dependency
38
+ name: activerecord
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ hash: 7
46
+ segments:
47
+ - 3
48
+ - 0
49
+ - 0
50
+ version: 3.0.0
51
+ type: :runtime
52
+ version_requirements: *id002
53
+ description:
54
+ email: jonathan.otto@gmail.com
55
+ executables: []
56
+
57
+ extensions: []
58
+
59
+ extra_rdoc_files:
60
+ - README.rdoc
61
+ files:
62
+ - .gitignore
63
+ - README.rdoc
64
+ - Rakefile
65
+ - VERSION
66
+ - lib/skinny_jeans.rb
67
+ - skinny-jeans.gemspec
68
+ has_rdoc: true
69
+ homepage: http://github.com/jotto/skinny-jeans
70
+ licenses: []
71
+
72
+ post_install_message:
73
+ rdoc_options:
74
+ - --charset=UTF-8
75
+ require_paths:
76
+ - lib
77
+ required_ruby_version: !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ hash: 3
83
+ segments:
84
+ - 0
85
+ version: "0"
86
+ required_rubygems_version: !ruby/object:Gem::Requirement
87
+ none: false
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ hash: 3
92
+ segments:
93
+ - 0
94
+ version: "0"
95
+ requirements: []
96
+
97
+ rubyforge_project:
98
+ rubygems_version: 1.3.7
99
+ signing_key:
100
+ specification_version: 3
101
+ summary: Fast webserver log parser for persisting daily pageviews per path to sqlite
102
+ test_files: []
103
+