skinny_jeans 0.5.2 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/skinny_jeans/log_parser.rb +164 -0
- data/lib/skinny_jeans/string_parser.rb +72 -0
- data/lib/skinny_jeans.rb +39 -193
- data/skinny_jeans.gemspec +4 -3
- data/test/skinny_jeans_string_parser_test.rb +27 -7
- data/test/skinny_jeans_test.rb +11 -6
- metadata +7 -6
- data/lib/skinny_jeans_string_parser.rb +0 -70
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.5.2
|
1
|
+
0.6.0
|
@@ -0,0 +1,164 @@
|
|
1
|
+
module SkinnyJeans

  # Incrementally parses an access log and persists aggregated counts.
  #
  # Each run:
  # 1. looks up the most recent Update row to find where the previous run
  #    stopped (by stored line text, falling back to its timestamp),
  # 2. aggregates every newer matching line in memory, keyed by
  #    date string => path (=> search keyword),
  # 3. adds those counts to the Pageview / PageviewKeyword rows, and
  # 4. records a new Update row describing where this run stopped.
  #
  # Pageview, PageviewKeyword, Update and SkinnyJeans.prepare_db are defined
  # outside this file.
  class LogParser

    # Convenience entry point: builds a parser and runs it.
    # Returns the parser instance (see #execute).
    def self.execute(logfile_path, sqlite_db_path, path_regexp, date_regexp)
      new(logfile_path, sqlite_db_path, path_regexp, date_regexp).execute
    end

    # hash_of_dates:              { "YYYY-MM-DD" => { path => count } }
    # hash_of_dates_for_keywords: { "YYYY-MM-DD" => { path => { keyword => count } } }
    # last_pageview_at:           Time of the last line counted by this run (nil if none)
    attr_accessor :hash_of_dates, :hash_of_dates_for_keywords, :last_pageview_at

    # logfile_path   - path of the access log to read
    # sqlite_db_path - handed to SkinnyJeans.prepare_db
    # path_regexp    - regexp whose first capture group is the path to count
    # date_regexp    - regexp whose first capture group is the timestamp text
    def initialize(logfile_path, sqlite_db_path, path_regexp, date_regexp)
      @logfile_path   = logfile_path
      @sqlite_db_path = sqlite_db_path
      @path_regexp    = path_regexp
      @date_regexp    = date_regexp
      # NOTE: any path containing "gz" anywhere is treated as gzip-compressed.
      @is_gzipped = !logfile_path.to_s[/gz/].nil?
      SkinnyJeans::prepare_db(@sqlite_db_path)
      @hash_of_dates = {}
      @hash_of_dates_for_keywords = {}
      @last_datetime = nil
    end

    # Parses the log, persists the aggregated counts and records an Update row.
    # Returns self.
    def execute
      lines_parsed = 0
      last_line_parsed = nil
      last_pageview_at = nil
      lineno_of_last_line_parsed = nil
      # last_update = Update.order("id DESC").limit(1).first
      last_update = Update.find(:first, :order => "id DESC", :limit => 1)

      # see if the last line parsed by the previous run exists in the current log file;
      # if it doesn't, we simply read anything with a timestamp not older than last_pageview_at
      if last_update
        last_pageview_at = last_update.last_pageview_at
        last_line_parsed = last_update.last_line_parsed
        file_reader do |line, lineno|
          if line.to_s[0..254] == last_line_parsed.to_s[0..254]
            lineno_of_last_line_parsed = lineno
            break
          end
        end
        puts "last line parsed was\n#{last_line_parsed}\nat lineno #{lineno_of_last_line_parsed}"
      end

      realtime = Benchmark.realtime do
        file_reader do |line, lineno|
          # skip everything up to and including the line the previous run stopped on
          next if lineno_of_last_line_parsed && lineno <= lineno_of_last_line_parsed

          path_match = line[@path_regexp, 1]
          next if path_match.nil?
          date_match = line[@date_regexp, 1]
          next if date_match.nil?
          datetime_obj = parse_string_as_date(date_match)

          # previous stop line not found in this file: fall back to the timestamp watermark
          next if lineno_of_last_line_parsed.nil? && !last_pageview_at.nil? && datetime_obj < last_pageview_at

          insert_or_increment(datetime_obj, path_match, SkinnyJeans::StringParser.extract_search_query(line))
          @last_pageview_at = datetime_obj
          last_line_parsed = line.to_s[0..254] # only 255 characters because we store it in the database
          lines_parsed += 1
        end
      end

      puts "completed parsing in #{realtime}"

      persisted = 0
      persisted_pageview_keywords = 0
      realtime = Benchmark.realtime do

        hash_of_dates.each do |date, hash_of_paths|
          hash_of_paths.each do |path, count|
            pv = Pageview.find_or_create_by_date_and_path(date, path)
            pv.pageview_count ||= 0
            pv.pageview_count += count
            pv.save!
            persisted += 1
          end
        end

        hash_of_dates_for_keywords.each do |date, hash_of_paths|
          hash_of_paths.each do |path, hash_of_keywords|
            hash_of_keywords.each do |keyword, count|
              pvk = PageviewKeyword.find_or_create_by_date_and_path_and_keyword(date, path, keyword)
              pvk.keyword = keyword.to_s[0..254]
              pvk.pageview_count ||= 0
              pvk.pageview_count += count
              pvk.save!
              persisted_pageview_keywords += 1
            end
          end
        end

      end

      puts "completed persistence in #{realtime}"

      # When this run counted no new lines, @last_pageview_at is still nil; keep the
      # previous run's timestamp so the watermark is not lost for the next run.
      Update.create!({
        :last_pageview_at => self.last_pageview_at || last_pageview_at,
        :lines_parsed     => lines_parsed,
        :last_line_parsed => last_line_parsed.to_s[0..254]
      })

      puts("total records in DB: #{Pageview.count}\n" \
           "lines parsed this round: #{lines_parsed}\n" \
           "lines persisted this round:#{persisted}\n" \
           "total SkinnyJeans executions since inception: #{Update.count}")

      self
    end

    # Copies the log file, yields each line of the copy as [line, lineno]
    # (lineno is zero-based), then removes the copy.
    #
    # The reader handle is closed and the copy removed even when the caller's
    # block breaks out early or raises.
    def file_reader
      temp_file_path = "#{@logfile_path}.copy"
      FileUtils.cp(@logfile_path, temp_file_path)

      if @is_gzipped
        Zlib::GzipReader.open(temp_file_path) do |gz|
          lineno = 0
          gz.each_line do |line|
            yield([line, lineno])
            lineno += 1
          end
        end
      else
        File.open(temp_file_path, "r") do |file|
          file.each_with_index { |line, lineno| yield([line, lineno]) }
        end
      end
    ensure
      FileUtils.rm_f(temp_file_path) if temp_file_path
    end

    # Accessors for the model classes, routed through get_ar_class.
    def pageview; get_ar_class(Pageview); end
    def update; get_ar_class(Update); end
    def pageview_keyword; get_ar_class(PageviewKeyword); end

    # Returns klass, first re-running SkinnyJeans.prepare_db if asking klass
    # for its connection raises ActiveRecord::ConnectionNotEstablished.
    def get_ar_class(klass)
      begin
        klass.connection
      rescue ActiveRecord::ConnectionNotEstablished
        SkinnyJeans::prepare_db(@sqlite_db_path)
      end
      klass
    end

    private

    # Converts a timestamp such as "02/Oct/2010:11:17:44 -0700" into a Time.
    # Both "+HHMM" and "-HHMM" offsets are accepted.
    # Text that does not match the expected layout makes Time.parse raise ArgumentError.
    def parse_string_as_date(date_string = "02/Oct/2010:11:17:44 -0700")
      day, month, year, hour, minute, seconds, zone =
        date_string.scan(/(\d{1,2})\/(\w{3,5})\/(\d{4}):(\d\d):(\d\d):(\d\d)\s([+-]?\d{3,4})/).flatten
      Time.parse("#{year}-#{month}-#{day} #{hour}:#{minute}:#{seconds} #{zone}")
    end

    # Adds one pageview for _path on the day of _datetime_obj and, when
    # _search_keyword is given, one keyword hit for that day/path/keyword.
    def insert_or_increment(_datetime_obj, _path, _search_keyword = nil)
      date_string = _datetime_obj.strftime("%Y-%m-%d")

      # data for all pageviews
      paths = (hash_of_dates[date_string] ||= {})
      paths[_path] = paths.fetch(_path, 0) + 1

      return if _search_keyword.nil?

      # data for just pageviews coming from search
      keyword_paths = (hash_of_dates_for_keywords[date_string] ||= {})
      keywords = (keyword_paths[_path] ||= {})
      keywords[_search_keyword] = keywords.fetch(_search_keyword, 0) + 1
    end

  end

end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
# example
|
2
|
+
# SkinnyJeans::StringParser.extract_search_query("http://search.aol.com/aol/search?enabled_terms=&s_it=comsearch50&q=cool+stuff")
|
3
|
+
# => "cool stuff"
|
4
|
+
|
5
|
+
module SkinnyJeans
|
6
|
+
class StringParser
|
7
|
+
|
8
|
+
class << self
|
9
|
+
def extract_search_query(_url)
|
10
|
+
self.new(_url).get_search_keyword
|
11
|
+
end
|
12
|
+
|
13
|
+
# pre: some referring URL from google, yahoo, AOL, bing, ask
|
14
|
+
# post: whatever the search query was, ASCII or GTFO
|
15
|
+
def extract_search_query_from_valid_url(url)
|
16
|
+
val = nil
|
17
|
+
case url
|
18
|
+
when /google\.com/
|
19
|
+
val=return_param_from_valid_url_or_path(url,"q")
|
20
|
+
when /search\.yahoo\.com/
|
21
|
+
val=return_param_from_valid_url_or_path(url,"p")
|
22
|
+
when /search\.aol\.com/
|
23
|
+
val=return_param_from_valid_url_or_path(url,"q")
|
24
|
+
when /ask\.com/
|
25
|
+
val=return_param_from_valid_url_or_path(url,"q")
|
26
|
+
when /bing\.com/
|
27
|
+
val=return_param_from_valid_url_or_path(url,"q")
|
28
|
+
end
|
29
|
+
# whitelist of acceptable characters
|
30
|
+
val = val.present? && val.gsub(/[^0-9A-Za-z\s"'!@#\$%\^&\*\(\)\?\<\>\[\]:;,\.+-_=]/, '') != val ? nil : val
|
31
|
+
return val
|
32
|
+
end
|
33
|
+
|
34
|
+
# pre: like http://example.org?q=cool&fun=no, "fun"
|
35
|
+
# post: "no"
|
36
|
+
def return_param_from_valid_url_or_path(url_or_path, param_name)
|
37
|
+
_uri = URI.parse(URI.encode(url_or_path))
|
38
|
+
if _uri.query.present?
|
39
|
+
_cgi = CGI.parse(_uri.query)
|
40
|
+
if _cgi[param_name]
|
41
|
+
val = unescape_string(_cgi[param_name].join).strip.downcase
|
42
|
+
return (!val.nil? && val!='' ? val : nil)
|
43
|
+
end
|
44
|
+
end
|
45
|
+
return nil
|
46
|
+
end
|
47
|
+
|
48
|
+
def unescape_string(_string)
|
49
|
+
temp = _string.dup
|
50
|
+
temp = CGI.unescape(temp) while CGI.unescape(temp) != temp
|
51
|
+
temp
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
attr_accessor :string_value
|
56
|
+
def initialize(string_value)
|
57
|
+
@string_value = string_value
|
58
|
+
end
|
59
|
+
|
60
|
+
# find all URLs in a string that are at beginning or end of string or are tokenized by spaces
|
61
|
+
def all_urls
|
62
|
+
@all_urls ||= string_value.split(/\s+/).reject { |_string| !_string.match(/^['"]?https?:['"]?/) }.collect { |url| url.gsub(/["']/,'') }
|
63
|
+
@all_urls.empty? ? nil : @all_urls
|
64
|
+
end
|
65
|
+
|
66
|
+
# iterate through any URLs we find in a string and return a search query or nil
|
67
|
+
def get_search_keyword
|
68
|
+
!all_urls.nil? ? all_urls.collect { |_url| self.class.extract_search_query_from_valid_url(_url) }[0] : nil
|
69
|
+
end
|
70
|
+
|
71
|
+
end
|
72
|
+
end
|
data/lib/skinny_jeans.rb
CHANGED
@@ -1,219 +1,65 @@
|
|
1
1
|
require 'time'
|
2
2
|
require 'benchmark'
|
3
|
-
require 'rubygems'
|
4
|
-
require 'sqlite3'
|
5
|
-
require 'active_record'
|
6
3
|
require 'zlib'
|
7
4
|
require 'fileutils'
|
8
5
|
require 'uri'
|
9
6
|
require 'cgi'
|
10
|
-
require '
|
7
|
+
require 'rubygems'
|
8
|
+
require 'active_record'
|
9
|
+
require 'sqlite3'
|
10
|
+
require File.expand_path(File.dirname(__FILE__) + "/skinny_jeans/string_parser")
|
11
|
+
require File.expand_path(File.dirname(__FILE__) + "/skinny_jeans/log_parser")
|
11
12
|
# require 'home_run'
|
12
13
|
|
13
|
-
class SkinnyJeans
|
14
|
-
|
15
|
-
def self.execute(logfile_path, sqlite_db_path, path_regexp, date_regexp)
|
16
|
-
self.new(logfile_path, sqlite_db_path, path_regexp, date_regexp).execute
|
17
|
-
end
|
18
|
-
|
19
|
-
attr_accessor :hash_of_dates, :hash_of_dates_for_keywords, :last_pageview_at
|
20
|
-
|
21
|
-
def initialize(logfile_path, sqlite_db_path, path_regexp, date_regexp)
|
22
|
-
@logfile_path, @sqlite_db_path, @path_regexp, @date_regexp = [logfile_path, sqlite_db_path, path_regexp, date_regexp]
|
23
|
-
@is_gzipped = !logfile_path.to_s[/gz/].nil?
|
24
|
-
prepare_db
|
25
|
-
@hash_of_dates = {}
|
26
|
-
@hash_of_dates_for_keywords = {}
|
27
|
-
@last_datetime = nil
|
28
|
-
end
|
29
14
|
|
15
|
+
# SkinnyJeans::execute(ARGV.first) if "#{$0}".gsub(/.*\//,"") == "skinny_jeans.rb"
|
16
|
+
module SkinnyJeans
|
30
17
|
class SkinnyJeanDb < ActiveRecord::Base
|
31
18
|
self.abstract_class = true
|
32
19
|
end
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
end
|
49
|
-
if !Update.table_exists?
|
50
|
-
SkinnyJeanDb.connection.create_table(:updates) do |t|
|
51
|
-
t.column :last_pageview_at, :timestamp
|
52
|
-
t.column :lines_parsed, :integer
|
53
|
-
t.column :last_line_parsed, :string
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
# addition from 2010-12-06 to track search traffic specifically
|
58
|
-
if !PageviewKeyword.table_exists?
|
59
|
-
SkinnyJeanDb.connection.create_table(:pageview_keywords) do |t|
|
60
|
-
t.column :date, :date
|
61
|
-
t.column :path, :string
|
62
|
-
t.column :pageview_count, :integer
|
63
|
-
t.column :keyword, :string
|
64
|
-
end
|
65
|
-
SkinnyJeanDb.connection.add_index(:pageview_keywords, [:date, :path, :keyword], :name => "date_path_keyword_index")
|
66
|
-
# SkinnyJeanDb.connection.add_index(:pageview_keywords, [:date, :pageview_count], :name => "date_pageview_count_index")
|
67
|
-
end
|
68
|
-
|
69
|
-
end
|
70
|
-
|
71
|
-
def execute
|
72
|
-
|
73
|
-
lines_parsed = 0
|
74
|
-
last_line_parsed, last_pageview_at, lineno_of_last_line_parsed = [nil,nil,nil]
|
75
|
-
# last_update = Update.order("id DESC").limit(1).first
|
76
|
-
last_update = Update.find(:first, :order => "id DESC", :limit => 1)
|
77
|
-
|
78
|
-
# see if the last_line_parsed parsed exists in the current log file
|
79
|
-
# if it doesnt exist, we'll simply read anything with a timestamp greater than last_pageview_at
|
80
|
-
if last_update
|
81
|
-
last_pageview_at, last_line_parsed = last_update.last_pageview_at, last_update.last_line_parsed
|
82
|
-
file_reader do |line, lineno|
|
83
|
-
if line.to_s[0..254] == last_line_parsed.to_s[0..254]
|
84
|
-
lineno_of_last_line_parsed = lineno
|
85
|
-
break
|
20
|
+
class Pageview < SkinnyJeanDb;end
|
21
|
+
class PageviewKeyword < SkinnyJeanDb;end
|
22
|
+
class Update < SkinnyJeanDb;end
|
23
|
+
|
24
|
+
class << self
|
25
|
+
def prepare_db(sqlite_db_path)
|
26
|
+
# create database if necessary
|
27
|
+
SQLite3::Database.new(sqlite_db_path)
|
28
|
+
SkinnyJeanDb.establish_connection(:adapter => 'sqlite3', :database => sqlite_db_path)
|
29
|
+
# create tables if necessary
|
30
|
+
if !Pageview.table_exists?
|
31
|
+
SkinnyJeanDb.connection.create_table(:pageviews) do |t|
|
32
|
+
t.column :date, :date
|
33
|
+
t.column :path, :string
|
34
|
+
t.column :pageview_count, :integer
|
86
35
|
end
|
36
|
+
# flow tight like skinny jeans with these compound indexes
|
37
|
+
SkinnyJeanDb.connection.add_index(:pageviews, [:date, :path], :name => "date_path_index")
|
38
|
+
SkinnyJeanDb.connection.add_index(:pageviews, [:date, :pageview_count], :name => "date_pageview_count_index")
|
87
39
|
end
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
lineno = -1
|
94
|
-
|
95
|
-
file_reader do |line, index|
|
96
|
-
lineno += 1
|
97
|
-
next if lineno_of_last_line_parsed && lineno <= lineno_of_last_line_parsed
|
98
|
-
|
99
|
-
path_match = line[@path_regexp, 1]
|
100
|
-
next if path_match.nil?
|
101
|
-
date_match = line[@date_regexp, 1]
|
102
|
-
next if date_match.nil?
|
103
|
-
datetime_obj = parse_string_as_date(date_match)
|
104
|
-
|
105
|
-
next if lineno_of_last_line_parsed.nil? && !last_pageview_at.nil? && datetime_obj < last_pageview_at
|
106
|
-
|
107
|
-
insert_or_increment(datetime_obj, path_match, SkinnyJeansStringParser.extract_search_query(line))
|
108
|
-
@last_pageview_at = datetime_obj
|
109
|
-
last_line_parsed = line.to_s[0..254] # only 255 characters because we store it in the database
|
110
|
-
lines_parsed += 1
|
111
|
-
end
|
112
|
-
end
|
113
|
-
|
114
|
-
puts "completed parsing in #{realtime}"
|
115
|
-
|
116
|
-
persisted = 0
|
117
|
-
persisted_pageview_keywords = 0
|
118
|
-
realtime = Benchmark.realtime do
|
119
|
-
|
120
|
-
hash_of_dates.each do |date, hash_of_paths|
|
121
|
-
hash_of_paths.keys.each do |path|
|
122
|
-
pv = Pageview.find_or_create_by_date_and_path(date, path)
|
123
|
-
pv.pageview_count ||= 0
|
124
|
-
pv.pageview_count += hash_of_paths[path]
|
125
|
-
pv.save!
|
126
|
-
persisted += 1
|
40
|
+
if !Update.table_exists?
|
41
|
+
SkinnyJeanDb.connection.create_table(:updates) do |t|
|
42
|
+
t.column :last_pageview_at, :timestamp
|
43
|
+
t.column :lines_parsed, :integer
|
44
|
+
t.column :last_line_parsed, :string
|
127
45
|
end
|
128
46
|
end
|
129
47
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
pvk.save!
|
138
|
-
persisted_pageview_keywords += 1
|
139
|
-
end
|
48
|
+
# addition from 2010-12-06 to track search traffic specifically
|
49
|
+
if !PageviewKeyword.table_exists?
|
50
|
+
SkinnyJeanDb.connection.create_table(:pageview_keywords) do |t|
|
51
|
+
t.column :date, :date
|
52
|
+
t.column :path, :string
|
53
|
+
t.column :pageview_count, :integer
|
54
|
+
t.column :keyword, :string
|
140
55
|
end
|
56
|
+
SkinnyJeanDb.connection.add_index(:pageview_keywords, [:date, :path, :keyword], :name => "date_path_keyword_index")
|
57
|
+
# SkinnyJeanDb.connection.add_index(:pageview_keywords, [:date, :pageview_count], :name => "date_pageview_count_index")
|
141
58
|
end
|
142
59
|
|
143
60
|
end
|
144
61
|
|
145
|
-
puts "completed persistence in #{realtime}"
|
146
|
-
|
147
|
-
Update.create!({:last_pageview_at => self.last_pageview_at, :lines_parsed => lines_parsed, :last_line_parsed => last_line_parsed.to_s[0..254]})
|
148
|
-
|
149
|
-
puts("total records in DB: #{Pageview.count}
|
150
|
-
lines parsed this round: #{lines_parsed}
|
151
|
-
lines persisted this round:#{persisted}
|
152
|
-
total SkinnyJeans executions since inception: #{Update.count}")
|
153
|
-
|
154
|
-
return self
|
155
|
-
|
156
|
-
end
|
157
|
-
|
158
|
-
# copies the log file, reads it, then removes it
|
159
|
-
def file_reader
|
160
|
-
|
161
|
-
temp_file_path = "#{@logfile_path}.copy"
|
162
|
-
temp_file = FileUtils.cp(@logfile_path, temp_file_path)
|
163
|
-
|
164
|
-
if @is_gzipped
|
165
|
-
lineno = 0
|
166
|
-
Zlib::GzipReader.new(File.new(temp_file_path, "r")).each_line{|line|yield([line,lineno]);lineno+=1}
|
167
|
-
# Zlib::GzipReader.open(@logfile_path).each_line{|line|yield([line,lineno]);lineno+=1}
|
168
|
-
else
|
169
|
-
File.new(temp_file_path, "r").each_with_index{|line, lineno| yield([line,lineno])}
|
170
|
-
end
|
171
|
-
|
172
|
-
FileUtils.rm_f(temp_file_path)
|
173
|
-
end
|
174
|
-
|
175
|
-
def pageview;get_ar_class(Pageview);end
|
176
|
-
def update;get_ar_class(Update);end
|
177
|
-
def pageview_keyword;get_ar_class(PageviewKeyword);end
|
178
|
-
|
179
|
-
def get_ar_class(klass)
|
180
|
-
begin;return(klass);rescue(ActiveRecord::ConnectionNotEstablished);prepare_db;end
|
181
|
-
end
|
182
|
-
|
183
|
-
private
|
184
|
-
|
185
|
-
# return a ruby Time object
|
186
|
-
def parse_string_as_date(date_string = "02/Oct/2010:11:17:44 -0700")
|
187
|
-
day,month,year,hour,minute,seconds,zone = date_string.scan(/(\d{1,2})\/(\w{3,5})\/(\d{4}):(\d\d):(\d\d):(\d\d)\s(-?\d{3,4})/).flatten
|
188
|
-
Time.parse("#{year}-#{month}-#{day} #{hour}:#{minute}:#{seconds} #{zone}")
|
189
|
-
end
|
190
|
-
|
191
|
-
def insert_or_increment(_datetime_obj, _path, _search_keyword = nil)
|
192
|
-
|
193
|
-
date_string = _datetime_obj.strftime(("%Y-%m-%d"))
|
194
|
-
|
195
|
-
# data for all pageviews
|
196
|
-
hash_of_dates[date_string] ||= {}
|
197
|
-
hash_of_dates[date_string][_path] ||= 0
|
198
|
-
hash_of_dates[date_string][_path] += 1
|
199
|
-
|
200
|
-
return if _search_keyword.nil?
|
201
|
-
|
202
|
-
# data for just pageviews coming from search
|
203
|
-
hash_of_dates_for_keywords[date_string] ||= {}
|
204
|
-
hash_of_dates_for_keywords[date_string][_path] ||= {}
|
205
|
-
hash_of_dates_for_keywords[date_string][_path][_search_keyword] ||= 0
|
206
|
-
hash_of_dates_for_keywords[date_string][_path][_search_keyword] += 1
|
207
|
-
|
208
|
-
end
|
209
|
-
|
210
|
-
class Pageview < SkinnyJeanDb
|
211
|
-
end
|
212
|
-
class PageviewKeyword < SkinnyJeanDb
|
213
|
-
end
|
214
|
-
class Update < SkinnyJeanDb
|
215
62
|
end
|
216
63
|
|
217
64
|
end
|
218
65
|
|
219
|
-
# SkinnyJeans::execute(ARGV.first) if "#{$0}".gsub(/.*\//,"") == "skinny_jeans.rb"
|
data/skinny_jeans.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{skinny_jeans}
|
8
|
-
s.version = "0.
|
8
|
+
s.version = "0.6.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Jonathan Otto"]
|
12
|
-
s.date = %q{
|
12
|
+
s.date = %q{2011-01-05}
|
13
13
|
s.email = %q{jonathan.otto@gmail.com}
|
14
14
|
s.extra_rdoc_files = [
|
15
15
|
"README.rdoc",
|
@@ -22,7 +22,8 @@ Gem::Specification.new do |s|
|
|
22
22
|
"TODO",
|
23
23
|
"VERSION",
|
24
24
|
"lib/skinny_jeans.rb",
|
25
|
-
"lib/
|
25
|
+
"lib/skinny_jeans/log_parser.rb",
|
26
|
+
"lib/skinny_jeans/string_parser.rb",
|
26
27
|
"skinny_jeans.gemspec"
|
27
28
|
]
|
28
29
|
s.homepage = %q{http://github.com/jotto/skinny_jeans}
|
@@ -1,15 +1,16 @@
|
|
1
|
-
require File.dirname(__FILE__) + '/../lib/skinny_jeans'
|
1
|
+
# require File.dirname(__FILE__) + '/../lib/skinny_jeans'
|
2
|
+
require File.expand_path(File.dirname(__FILE__) + "/../lib/skinny_jeans")
|
2
3
|
require 'test/unit'
|
3
4
|
require 'pp'
|
4
|
-
|
5
|
-
class
|
5
|
+
|
6
|
+
class SkinnyJeans::StringParserTest < Test::Unit::TestCase
|
6
7
|
|
7
8
|
|
8
9
|
def test_can_get_all_urls_from_string
|
9
10
|
_string=<<-EOF
|
10
11
|
98.244.200.209 - - [01/Dec/2010:11:51:26 -0800] "GET /deals/apple-ipod-touch HTTP/1.1" 200 11448 "http://www.google.com/m/search?oe=UTF-8&client=safari&hl=en&q=best+deals+for+the+4th+generation+iPod+touch+32+gb&gws_link_params=spell:1&ei=aqb2TJDBLqGutgfp862NAg&ved=0CBEQBSgA" "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_1 like Mac OS X; en-us) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8B117 Safari/6531.22.7" "-"
|
11
12
|
EOF
|
12
|
-
sjsp =
|
13
|
+
sjsp = SkinnyJeans::StringParser.new(_string)
|
13
14
|
assert_equal 1, sjsp.all_urls.size
|
14
15
|
assert_equal "http://www.google.com/m/search?oe=UTF-8&client=safari&hl=en&q=best+deals+for+the+4th+generation+iPod+touch+32+gb&gws_link_params=spell:1&ei=aqb2TJDBLqGutgfp862NAg&ved=0CBEQBSgA",
|
15
16
|
sjsp.all_urls.first
|
@@ -19,7 +20,7 @@ class SkinnyJeansStringParserTest < Test::Unit::TestCase
|
|
19
20
|
_string=<<-EOF
|
20
21
|
http://www.bing.com/search?q=XPS+17+coupon&src={referrer:source?}
|
21
22
|
EOF
|
22
|
-
sjsp =
|
23
|
+
sjsp = SkinnyJeans::StringParser.new(_string)
|
23
24
|
assert_equal "xps 17 coupon", sjsp.get_search_keyword
|
24
25
|
end
|
25
26
|
|
@@ -27,7 +28,7 @@ class SkinnyJeansStringParserTest < Test::Unit::TestCase
|
|
27
28
|
_string=<<-EOF
|
28
29
|
207.46.12.204 - - [01/Dec/2010:11:48:00 -0800] "GET /deals/skullcandy-inkd-earbuds HTTP/1.1" 200 5732 "-" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; SLCC1; .NET CLR 1.1.4322; .NET CLR 2.0.40607; .NET CLR 3.0.04506.648)" "-"
|
29
30
|
EOF
|
30
|
-
sjsp =
|
31
|
+
sjsp = SkinnyJeans::StringParser.new(_string)
|
31
32
|
assert_nil sjsp.all_urls
|
32
33
|
|
33
34
|
assert_nil sjsp.get_search_keyword
|
@@ -37,8 +38,27 @@ class SkinnyJeansStringParserTest < Test::Unit::TestCase
|
|
37
38
|
_string=<<-EOF
|
38
39
|
98.244.200.209 - - [01/Dec/2010:11:51:26 -0800] "GET /deals/apple-ipod-touch HTTP/1.1" 200 11448 "http://www.google.com/m/search?oe=UTF-8&client=safari&hl=en&q=best+deals+for+the+4th+generation+iPod+touch+32+gb&gws_link_params=spell:1&ei=aqb2TJDBLqGutgfp862NAg&ved=0CBEQBSgA" "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_1 like Mac OS X; en-us) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8B117 Safari/6531.22.7" "-"
|
39
40
|
EOF
|
40
|
-
sjsp =
|
41
|
+
sjsp = SkinnyJeans::StringParser.new(_string)
|
41
42
|
assert_equal "best deals for the 4th generation ipod touch 32 gb", sjsp.get_search_keyword
|
42
43
|
end
|
43
44
|
|
45
|
+
def test_return_param_from_url
|
46
|
+
assert_equal "nowai", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("/deals/hp-dv6-laptop?ok=yes&yea=true&drc=nowai","drc")
|
47
|
+
assert_equal "iDesign Tower Stereo System".downcase, SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://us.yhs.search.yahoo.com/if?partnerid=yhs-if-freecause&fr=yhs-if-freecause&ei=UTF-8&YST_b=21&tid=61613&uid=47727219&p=iDesign Tower Stereo System","p")
|
48
|
+
assert_equal "yes", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("/deals/hp-dv6-laptop?ok=yes&yea=true&drc=nowai","ok")
|
49
|
+
assert_equal "yes", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://dealzon.com/deals/hp-dv6-laptop?ok=yes&yea=true&drc=nowai","ok")
|
50
|
+
assert_equal "nowai", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://dealzon.com/deals/hp-dv6-laptop?ok=yes&yea=true&drc=nowai","drc")
|
51
|
+
|
52
|
+
|
53
|
+
assert_equal "hp coupon", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://search.yahoo.com/search;_ylt=AqnIgbSoqn0rhe69ABoUdv.bvZx4?p=hp+coupon&toggle=1&cop=mss&ei=UTF-8&fr=yfp-t-701","p")
|
54
|
+
assert_equal "hp coupon", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://search.yahoo.com/search;_ylt=A0oG7h6HJPtMlGIBsFyl87UF;_ylc=X1MDMjE0MjQ3ODk0OARfcgMyBGZyA3NmcARuX2dwcwMxMARvcmlnaW4Dc3ljBHF1ZXJ5A2hwIGNvdXBvbgRzYW8DMQ--?p=hp+coupon&fr=sfp&fr2=&iscqry=","p")
|
55
|
+
assert_equal "hp coupon", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://www.ask.com/web?q=hp+coupon&search=&qsrc=0&o=0&l=dir","q")
|
56
|
+
assert_equal "hp coupon", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://search.aol.com/aol/search?enabled_terms=&s_it=comsearch50&q=hp+coupon","q")
|
57
|
+
assert_equal "hp coupns", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://www.bing.com/search?q=hp+coupns&go=&form=QBLH&qs=n&sk=&sc=8-6","q")
|
58
|
+
assert_equal "dv6 coupon", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://www.google.com/url?sa=t&source=web&cd=1&ved=0CCEQFjAA&url=http%3A%2F%2Fdealzon.com%2Fdeals%2Fhp-dv6-laptop&rct=j&q=dv6%20coupon&ei=1Pz7TKb5AoP7lwfEoeSgBQ&usg=AFQjCNGFVL4PvZ59hbxkCFVdmHMayEe3UQ&sig2=Vs7s9a1z2Elm23NVffMJ8A","q")
|
59
|
+
assert_equal "dv6 coupon", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://www.google.com/url?sa=t&source=web&cd=1&sqi=2&ved=0CCAQFjAA&url=http%3A%2F%2Fdealzon.com%2Fdeals%2Fhp-dv6-laptop&rct=j&q=dv6%20coupon&ei=xyT7TO7kEYGKlwfT9tSPDA&usg=AFQjCNGFVL4PvZ59hbxkCFVdmHMayEe3UQ&sig2=7pnUG3YTE8sHONsIqLl1sg","q")
|
60
|
+
|
61
|
+
assert_equal "dv6 coupon", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://www.google.com/url?sa=t&source=web&cd=1&sqi=2&ved=0CCAQFjAA&url=http%3A%2F%2Fdealzon.com%2Fdeals%2Fhp-dv6-laptop&rct=j&q=dv6%20coupon%20&ei=xyT7TO7kEYGKlwfT9tSPDA&usg=AFQjCNGFVL4PvZ59hbxkCFVdmHMayEe3UQ&sig2=7pnUG3YTE8sHONsIqLl1sg","q")
|
62
|
+
end
|
63
|
+
|
44
64
|
end
|
data/test/skinny_jeans_test.rb
CHANGED
@@ -1,14 +1,17 @@
|
|
1
|
-
|
1
|
+
|
2
|
+
require File.expand_path(File.dirname(__FILE__) + "/../lib/skinny_jeans")
|
2
3
|
require 'test/unit'
|
3
4
|
require 'pp'
|
4
|
-
|
5
|
+
|
5
6
|
class SkinnyJeansTest < Test::Unit::TestCase
|
6
7
|
|
7
8
|
|
8
9
|
def test_parse_pick_up_where_left_off
|
9
|
-
db_path = "
|
10
|
+
db_path = File.expand_path(File.dirname(__FILE__) + "/skinny_jeans_test.db")
|
11
|
+
# db_path = "./skinny_jeans_test.db"
|
10
12
|
FileUtils.rm(db_path) if File.exists?(db_path)
|
11
|
-
|
13
|
+
_logfile_path = File.expand_path(File.dirname(__FILE__) + "/small_access_log.log")
|
14
|
+
sj=SkinnyJeans::LogParser.new(_logfile_path, sqlite_skinny_jeans = db_path, path_regexp = /\s\/deals\/(.*)\sHTTP/, date_regexp = /\[(\d.*\d)\]/)
|
12
15
|
sj.execute
|
13
16
|
assert_equal 20, sj.pageview.count
|
14
17
|
assert_equal 2, sj.pageview.find_by_path("flip-video").pageview_count
|
@@ -20,7 +23,8 @@ class SkinnyJeansTest < Test::Unit::TestCase
|
|
20
23
|
#
|
21
24
|
# "
|
22
25
|
# the 2nd file is the same, but with 2 additional lines for flip-video and apple-ipod-touch
|
23
|
-
|
26
|
+
_logfile_path_2 = File.expand_path(File.dirname(__FILE__) + "/small_access_log_part_2.log")
|
27
|
+
sj=SkinnyJeans::LogParser.new(_logfile_path_2, sqlite_skinny_jeans = db_path, path_regexp = /\s\/deals\/(.*)\sHTTP/, date_regexp = /\[(\d.*\d)\]/)
|
24
28
|
sj.execute
|
25
29
|
assert_equal 3, sj.pageview.find_by_path("flip-video").pageview_count
|
26
30
|
assert_equal 2, sj.pageview.find_by_path("apple-ipod-touch").pageview_count
|
@@ -39,8 +43,9 @@ class SkinnyJeansTest < Test::Unit::TestCase
|
|
39
43
|
#
|
40
44
|
# "
|
41
45
|
|
46
|
+
_logfile_path_3 = File.expand_path(File.dirname(__FILE__) + "/small_access_log_part_3.log")
|
42
47
|
# the 3rd has 1 additional line so we can ensure we can leave off on a line over 255 characters
|
43
|
-
sj=SkinnyJeans.new(
|
48
|
+
sj=SkinnyJeans::LogParser.new(_logfile_path_3, sqlite_skinny_jeans = db_path, path_regexp = /\s\/deals\/(.*)\sHTTP/, date_regexp = /\[(\d.*\d)\]/)
|
44
49
|
sj.execute
|
45
50
|
assert_equal 3, sj.pageview.find_by_path("delonghi-hhp1500-safeheat-mica-panel-radiator-heater-with-thermostat-control").pageview_count
|
46
51
|
assert_equal 3, sj.pageview.find_by_path("apple-ipod-touch").pageview_count
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: skinny_jeans
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 7
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
-
|
10
|
-
version: 0.5.2
|
8
|
+
- 6
|
9
|
+
- 0
|
10
|
+
version: 0.6.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jonathan Otto
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2011-01-05 00:00:00 -06:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -66,7 +66,8 @@ files:
|
|
66
66
|
- TODO
|
67
67
|
- VERSION
|
68
68
|
- lib/skinny_jeans.rb
|
69
|
-
- lib/skinny_jeans_string_parser.rb
|
69
|
+
- lib/skinny_jeans/log_parser.rb
|
70
|
+
- lib/skinny_jeans/string_parser.rb
|
70
71
|
- skinny_jeans.gemspec
|
71
72
|
- test/skinny_jeans_string_parser_test.rb
|
72
73
|
- test/skinny_jeans_test.rb
|
@@ -1,70 +0,0 @@
|
|
1
|
-
# example
|
2
|
-
# SkinnyJeansStringParser.extract_search_query("http://search.aol.com/aol/search?enabled_terms=&s_it=comsearch50&q=cool+stuff")
|
3
|
-
# => "cool stuff"
|
4
|
-
|
5
|
-
class SkinnyJeansStringParser
|
6
|
-
|
7
|
-
def self.extract_search_query(_url)
|
8
|
-
self.new(_url).get_search_keyword
|
9
|
-
end
|
10
|
-
|
11
|
-
attr_accessor :string_value
|
12
|
-
def initialize(string_value)
|
13
|
-
@string_value = string_value
|
14
|
-
end
|
15
|
-
|
16
|
-
# iterate through any URLs we find in a string and return a search query or nil
|
17
|
-
def get_search_keyword
|
18
|
-
!all_urls.nil? ? all_urls.collect { |_url| extract_search_query_from_url(_url) }[0] : nil
|
19
|
-
end
|
20
|
-
|
21
|
-
# pre: some referring URL from google, yahoo, AOL, bing, ask
|
22
|
-
# post: whatever the search query was, ASCII or GTFO
|
23
|
-
def extract_search_query_from_url(url)
|
24
|
-
val = nil
|
25
|
-
case url
|
26
|
-
when /google\.com/
|
27
|
-
val=return_param_from_url(url, "q")
|
28
|
-
when /search\.yahoo\.com/
|
29
|
-
val=return_param_from_url(url, "p")
|
30
|
-
when /search\.aol\.com/
|
31
|
-
val=return_param_from_url(url, "q")
|
32
|
-
when /ask\.com/
|
33
|
-
val=return_param_from_url(url, "q")
|
34
|
-
when /bing\.com/
|
35
|
-
val=return_param_from_url(url, "q")
|
36
|
-
end
|
37
|
-
# whitelist of acceptable characters
|
38
|
-
val = val.present? && val.gsub(/[^0-9A-Za-z\s"'!@#\$%\^&\*\(\)\?\<\>\[\]:;,\.+-_=]/, '') != val ? nil : val
|
39
|
-
return val
|
40
|
-
end
|
41
|
-
|
42
|
-
# pre: like http://example.org?q=cool&fun=no, "fun"
|
43
|
-
# post: "no"
|
44
|
-
def return_param_from_url(url, param_name)
|
45
|
-
_uri = URI.parse(URI.encode(url))
|
46
|
-
if _uri.query.present?
|
47
|
-
_cgi = CGI.parse(_uri.query)
|
48
|
-
if _cgi[param_name]
|
49
|
-
val = unescape_string(_cgi[param_name].to_s).strip.downcase
|
50
|
-
return (!val.nil? && val!='' ? val : nil)
|
51
|
-
end
|
52
|
-
end
|
53
|
-
return nil
|
54
|
-
end
|
55
|
-
|
56
|
-
# find all URLs in a string that are at beginning or end of string or are tokenized by spaces
|
57
|
-
def all_urls
|
58
|
-
@all_urls ||= string_value.split(/\s+/).reject { |_string| !_string.match(/^['"]?https?:['"]?/) }.collect { |url| url.gsub(/["']/,'') }
|
59
|
-
@all_urls.empty? ? nil : @all_urls
|
60
|
-
end
|
61
|
-
|
62
|
-
private
|
63
|
-
def unescape_string(_string)
|
64
|
-
temp = _string.dup
|
65
|
-
temp = CGI.unescape(temp) while CGI.unescape(temp) != temp
|
66
|
-
temp
|
67
|
-
end
|
68
|
-
|
69
|
-
|
70
|
-
end
|