skinny_jeans 0.8.1 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +7 -9
- data/Rakefile +3 -2
- data/VERSION +1 -1
- data/lib/skinny_jeans/string_parser.rb +2 -7
- data/lib/skinny_jeans.rb +30 -1
- data/skinny_jeans.gemspec +10 -7
- data/test/skinny_jeans_string_parser_test.rb +1 -0
- data/test/skinny_jeans_test.rb +1 -0
- metadata +24 -9
data/README.rdoc
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
= SKINNY JEANS LOG PARSING WITH RUBY & SQLITE FOR HIPSTERS
|
1
|
+
= SKINNY JEANS FAST LOG PARSING WITH RUBY & SQLITE FOR HIPSTERS
|
2
2
|
http://img696.imageshack.us/img696/75/skinnys3.jpg
|
3
3
|
|
4
4
|
== EXAMPLE
|
@@ -30,24 +30,22 @@ http://img696.imageshack.us/img696/75/skinnys3.jpg
|
|
30
30
|
sj = SkinnyJeans::LogParser::execute(logfile_path = "access.log", sqlite_skinny_jeans = "sqlite_skinny_jeans.db", path_regexp = /\s\/posts\/(.*)\sHTTP/, date_regexp = /\[(\d.*\d)\]/)
|
31
31
|
sj.pageview.where("date = '2010-10-01' and path = 'my-first-post'")
|
32
32
|
=> #<SkinnyJeans::Pageview id: 1, date: "2010-10-01", path: "my-first-post", pageview_count: 3>
|
33
|
-
1. NOTE: for now **you have to monkey patch the SkinnyJeans#parse_string_as_date**
|
33
|
+
1. NOTE: for now **you may have to monkey patch the SkinnyJeans#parse_string_as_date**
|
34
34
|
2. Parse oldest logs first, then run regularly against your main log, let logrotate handle the rest (skinny_jeans remembers where it left off)
|
35
35
|
3. ASSUMES reading log files in ascending order, keeps track of last line read so you could put it on a scheduler or cron job
|
36
|
-
4. access the 2 activerecord classes, sj.pageview (returns Pageview class), and sj.update
|
36
|
+
4. access the 2 activerecord classes, sj.pageview (returns Pageview class), and sj.update happily within your own app
|
37
37
|
5. enjoy the skinny jeans
|
38
38
|
|
39
39
|
|
40
|
-
== PERFORMANCE
|
41
|
-
* it parses
|
42
|
-
* persists
|
43
|
-
* 25,000 rows == 4 megabyte sqlite database
|
40
|
+
== PERFORMANCE
|
41
|
+
* it parses 300,000 lines in < 9 seconds and
|
42
|
+
* persists 6,000 requests (in 2 different tables, one of the tables is for search engine referrals) with 2 compound indexes in 18 seconds
|
44
43
|
|
45
44
|
== NOTES
|
46
45
|
* supports gzipped files
|
47
46
|
* creates a temp copy of the log file before parsing
|
48
47
|
* currently requires each line to be unique
|
49
|
-
|
50
|
-
* this is only a problem if the last line parsed is one of the pages that was hit by the same client more than once in less than one second
|
48
|
+
* line uniqueness is a potential problem if a single client manages to hit the same page more than once in less than 1 second AND it's the last line of a log file
|
51
49
|
|
52
50
|
== LICENSE
|
53
51
|
|
data/Rakefile
CHANGED
@@ -8,9 +8,10 @@ begin
|
|
8
8
|
s.email = "jonathan.otto@gmail.com"
|
9
9
|
s.homepage = "http://github.com/jotto/skinny_jeans"
|
10
10
|
s.authors = ["Jonathan Otto"]
|
11
|
-
s.add_dependency 'sqlite3-ruby', '>= 1.
|
12
|
-
s.add_dependency 'activerecord', '>=
|
11
|
+
s.add_dependency 'sqlite3-ruby', '>= 1.3.3'
|
12
|
+
s.add_dependency 'activerecord', '>= 3.0.0'
|
13
13
|
s.add_dependency 'spinner', '>= 1.0.0'
|
14
|
+
s.add_dependency 'home_run', '>= 1.0.1'
|
14
15
|
end
|
15
16
|
rescue LoadError
|
16
17
|
puts "Jeweler, or one of its dependencies, is not available. Install it with: sudo gem install jeweler"
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.9.0
|
@@ -29,7 +29,7 @@ module SkinnyJeans
|
|
29
29
|
val=return_param_from_valid_url_or_path(url,"q")
|
30
30
|
end
|
31
31
|
# whitelist of acceptable characters
|
32
|
-
val = val
|
32
|
+
val = !!val && val.gsub(/[^0-9A-Za-z\s"'!@#\$%\^&\*\(\)\?\<\>\[\]:;,\.+-_=]/, '') != val ? nil : val
|
33
33
|
return val
|
34
34
|
end
|
35
35
|
|
@@ -40,18 +40,13 @@ module SkinnyJeans
|
|
40
40
|
if _uri.query.present?
|
41
41
|
_cgi = CGI.parse(_uri.query)
|
42
42
|
if _cgi[param_name]
|
43
|
-
val =
|
43
|
+
val = URI.decode(_cgi[param_name].join).strip.downcase
|
44
44
|
return (!val.nil? && val!='' ? val : nil)
|
45
45
|
end
|
46
46
|
end
|
47
47
|
return nil
|
48
48
|
end
|
49
49
|
|
50
|
-
def unescape_string(_string)
|
51
|
-
temp = _string.dup
|
52
|
-
temp = CGI.unescape(temp) while CGI.unescape(temp) != temp
|
53
|
-
temp
|
54
|
-
end
|
55
50
|
end
|
56
51
|
|
57
52
|
attr_accessor :string_value
|
data/lib/skinny_jeans.rb
CHANGED
@@ -8,9 +8,16 @@ require 'rubygems'
|
|
8
8
|
require 'active_record'
|
9
9
|
require 'sqlite3'
|
10
10
|
require 'spinner'
|
11
|
+
|
12
|
+
# faster URI decoding (negligible savings)
|
13
|
+
# leaving comment for reference
|
14
|
+
# require 'escape_utils'
|
15
|
+
|
11
16
|
require File.expand_path(File.dirname(__FILE__) + "/skinny_jeans/string_parser")
|
12
17
|
require File.expand_path(File.dirname(__FILE__) + "/skinny_jeans/log_parser")
|
13
|
-
|
18
|
+
|
19
|
+
# faster date parsing (about a 17% speed boost)
|
20
|
+
require 'home_run'
|
14
21
|
|
15
22
|
|
16
23
|
# SkinnyJeans::execute(ARGV.first) if "#{$0}".gsub(/.*\//,"") == "skinny_jeans.rb"
|
@@ -64,3 +71,25 @@ module SkinnyJeans
|
|
64
71
|
|
65
72
|
end
|
66
73
|
|
74
|
+
# class CGI
|
75
|
+
# # @@accept_charset="UTF-8" unless defined?(@@accept_charset)
|
76
|
+
# # # URL-encode a string.
|
77
|
+
# # # url_encoded_string = CGI::escape("'Stop!' said Fred")
|
78
|
+
# # # # => "%27Stop%21%27+said+Fred"
|
79
|
+
# # def CGI::escape(string)
|
80
|
+
# # string.gsub(/([^ a-zA-Z0-9_.-]+)/) do
|
81
|
+
# # '%' + $1.unpack('H2' * $1.bytesize).join('%').upcase
|
82
|
+
# # end.tr(' ', '+')
|
83
|
+
# # end
|
84
|
+
#
|
85
|
+
#
|
86
|
+
# # URL-decode a string with encoding(optional).
|
87
|
+
# # string = CGI::unescape("%27Stop%21%27+said+Fred")
|
88
|
+
# # # => "'Stop!' said Fred"
|
89
|
+
# def CGI::unescape(string,encoding=@@accept_charset)
|
90
|
+
# str=string.tr('+', ' ').force_encoding(Encoding::ASCII_8BIT).gsub(/((?:%[0-9a-fA-F]{2})+)/u) do
|
91
|
+
# [$1.delete('%')].pack('H*')
|
92
|
+
# end.force_encoding(encoding)
|
93
|
+
# str.valid_encoding? ? str : str.force_encoding(string.encoding)
|
94
|
+
# end
|
95
|
+
# end
|
data/skinny_jeans.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{skinny_jeans}
|
8
|
-
s.version = "0.
|
8
|
+
s.version = "0.9.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Jonathan Otto"]
|
@@ -40,18 +40,21 @@ Gem::Specification.new do |s|
|
|
40
40
|
s.specification_version = 3
|
41
41
|
|
42
42
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
43
|
-
s.add_runtime_dependency(%q<sqlite3-ruby>, [">= 1.
|
44
|
-
s.add_runtime_dependency(%q<activerecord>, [">=
|
43
|
+
s.add_runtime_dependency(%q<sqlite3-ruby>, [">= 1.3.3"])
|
44
|
+
s.add_runtime_dependency(%q<activerecord>, [">= 3.0.0"])
|
45
45
|
s.add_runtime_dependency(%q<spinner>, [">= 1.0.0"])
|
46
|
+
s.add_runtime_dependency(%q<home_run>, [">= 1.0.1"])
|
46
47
|
else
|
47
|
-
s.add_dependency(%q<sqlite3-ruby>, [">= 1.
|
48
|
-
s.add_dependency(%q<activerecord>, [">=
|
48
|
+
s.add_dependency(%q<sqlite3-ruby>, [">= 1.3.3"])
|
49
|
+
s.add_dependency(%q<activerecord>, [">= 3.0.0"])
|
49
50
|
s.add_dependency(%q<spinner>, [">= 1.0.0"])
|
51
|
+
s.add_dependency(%q<home_run>, [">= 1.0.1"])
|
50
52
|
end
|
51
53
|
else
|
52
|
-
s.add_dependency(%q<sqlite3-ruby>, [">= 1.
|
53
|
-
s.add_dependency(%q<activerecord>, [">=
|
54
|
+
s.add_dependency(%q<sqlite3-ruby>, [">= 1.3.3"])
|
55
|
+
s.add_dependency(%q<activerecord>, [">= 3.0.0"])
|
54
56
|
s.add_dependency(%q<spinner>, [">= 1.0.0"])
|
57
|
+
s.add_dependency(%q<home_run>, [">= 1.0.1"])
|
55
58
|
end
|
56
59
|
end
|
57
60
|
|
@@ -65,6 +65,7 @@ class SkinnyJeans::StringParserTest < Test::Unit::TestCase
|
|
65
65
|
|
66
66
|
assert_equal "\"dealzon\"", SkinnyJeans::StringParser.extract_search_query("http://www.google.com/search?client=safari&rls=en&q=%22dealzon%22&ie=UTF-8&oe=UTF-8")
|
67
67
|
assert_equal "\'dealzon\'", SkinnyJeans::StringParser.extract_search_query("http://www.google.com/search?client=safari&rls=en&q='dealzon'&ie=UTF-8&oe=UTF-8")
|
68
|
+
assert_equal "legos on sale %80 off", SkinnyJeans::StringParser::extract_search_query("http://www.google.com/url?sa=t&source=web&cd=8&ved=0CF0QFjAH&url=http%3A%2F%2Fdealzon.com%2Fdeals%2Flego-mindstorms-nxt-robotics-micro-controller-brick-black&rct=j&q=legos%20on%20sale%20%2580%20off&ei=klp9TdyXEPC10QHMpujiAw&usg=AFQjCNFksmyHsu8gLq436-GpHmeu-OAcWQ")
|
68
69
|
end
|
69
70
|
|
70
71
|
end
|
data/test/skinny_jeans_test.rb
CHANGED
@@ -9,6 +9,7 @@ class SkinnyJeansTest < Test::Unit::TestCase
|
|
9
9
|
def test_will_work
|
10
10
|
db_path = File.expand_path(File.dirname(__FILE__) + "/more_sample_data/skinny_jeans_deal_show.db")
|
11
11
|
FileUtils.rm(db_path) if File.exists?(db_path)
|
12
|
+
# broken_access_big.log is 337,925 lines
|
12
13
|
_logfile_path = File.expand_path(File.dirname(__FILE__) + "/more_sample_data/broken_access_big.log")
|
13
14
|
puts db_path.inspect
|
14
15
|
sj=SkinnyJeans::LogParser.execute(_logfile_path, sqlite_skinny_jeans = db_path, path_regexp = /\s\/deals\/(.*)\sHTTP/, date_regexp = /\[(\d.*\d)\]/)
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.
|
7
|
+
- 9
|
8
|
+
- 0
|
9
|
+
version: 0.9.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jonathan Otto
|
@@ -27,9 +27,9 @@ dependencies:
|
|
27
27
|
- !ruby/object:Gem::Version
|
28
28
|
segments:
|
29
29
|
- 1
|
30
|
-
-
|
31
|
-
-
|
32
|
-
version: 1.
|
30
|
+
- 3
|
31
|
+
- 3
|
32
|
+
version: 1.3.3
|
33
33
|
type: :runtime
|
34
34
|
version_requirements: *id001
|
35
35
|
- !ruby/object:Gem::Dependency
|
@@ -41,10 +41,10 @@ dependencies:
|
|
41
41
|
- - ">="
|
42
42
|
- !ruby/object:Gem::Version
|
43
43
|
segments:
|
44
|
-
- 2
|
45
44
|
- 3
|
46
|
-
-
|
47
|
-
|
45
|
+
- 0
|
46
|
+
- 0
|
47
|
+
version: 3.0.0
|
48
48
|
type: :runtime
|
49
49
|
version_requirements: *id002
|
50
50
|
- !ruby/object:Gem::Dependency
|
@@ -62,6 +62,21 @@ dependencies:
|
|
62
62
|
version: 1.0.0
|
63
63
|
type: :runtime
|
64
64
|
version_requirements: *id003
|
65
|
+
- !ruby/object:Gem::Dependency
|
66
|
+
name: home_run
|
67
|
+
prerelease: false
|
68
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
69
|
+
none: false
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
segments:
|
74
|
+
- 1
|
75
|
+
- 0
|
76
|
+
- 1
|
77
|
+
version: 1.0.1
|
78
|
+
type: :runtime
|
79
|
+
version_requirements: *id004
|
65
80
|
description: Fast webserver log parser for persisting daily pageviews per path to sqlite
|
66
81
|
email: jonathan.otto@gmail.com
|
67
82
|
executables: []
|