skinny_jeans 0.5.2 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.5.2
1
+ 0.6.0
@@ -0,0 +1,164 @@
1
+ module SkinnyJeans
2
+
3
+ class LogParser
4
+
5
+ def self.execute(logfile_path, sqlite_db_path, path_regexp, date_regexp)
6
+ self.new(logfile_path, sqlite_db_path, path_regexp, date_regexp).execute
7
+ end
8
+
9
+ attr_accessor :hash_of_dates, :hash_of_dates_for_keywords, :last_pageview_at
10
+
11
+ def initialize(logfile_path, sqlite_db_path, path_regexp, date_regexp)
12
+ @logfile_path, @sqlite_db_path, @path_regexp, @date_regexp = [logfile_path, sqlite_db_path, path_regexp, date_regexp]
13
+ @is_gzipped = !logfile_path.to_s[/gz/].nil?
14
+ SkinnyJeans::prepare_db(@sqlite_db_path)
15
+ @hash_of_dates = {}
16
+ @hash_of_dates_for_keywords = {}
17
+ @last_datetime = nil
18
+ end
19
+
20
+
21
+ def execute
22
+
23
+ lines_parsed = 0
24
+ last_line_parsed, last_pageview_at, lineno_of_last_line_parsed = [nil,nil,nil]
25
+ # last_update = Update.order("id DESC").limit(1).first
26
+ last_update = Update.find(:first, :order => "id DESC", :limit => 1)
27
+
28
+ # see if last_line_parsed exists in the current log file
29
+ # if it doesn't exist, we'll simply read anything with a timestamp at or after last_pageview_at
30
+ if last_update
31
+ last_pageview_at, last_line_parsed = last_update.last_pageview_at, last_update.last_line_parsed
32
+ file_reader do |line, lineno|
33
+ if line.to_s[0..254] == last_line_parsed.to_s[0..254]
34
+ lineno_of_last_line_parsed = lineno
35
+ break
36
+ end
37
+ end
38
+ puts "last line parsed was\n#{last_line_parsed}\nat lineno #{lineno_of_last_line_parsed}"
39
+ end
40
+
41
+ realtime = Benchmark.realtime do
42
+ date_path_pairs_array = []
43
+ lineno = -1
44
+
45
+ file_reader do |line, index|
46
+ lineno += 1
47
+ next if lineno_of_last_line_parsed && lineno <= lineno_of_last_line_parsed
48
+
49
+ path_match = line[@path_regexp, 1]
50
+ next if path_match.nil?
51
+ date_match = line[@date_regexp, 1]
52
+ next if date_match.nil?
53
+ datetime_obj = parse_string_as_date(date_match)
54
+
55
+ next if lineno_of_last_line_parsed.nil? && !last_pageview_at.nil? && datetime_obj < last_pageview_at
56
+
57
+ insert_or_increment(datetime_obj, path_match, SkinnyJeans::StringParser.extract_search_query(line))
58
+ @last_pageview_at = datetime_obj
59
+ last_line_parsed = line.to_s[0..254] # only 255 characters because we store it in the database
60
+ lines_parsed += 1
61
+ end
62
+ end
63
+
64
+ puts "completed parsing in #{realtime}"
65
+
66
+ persisted = 0
67
+ persisted_pageview_keywords = 0
68
+ realtime = Benchmark.realtime do
69
+
70
+ hash_of_dates.each do |date, hash_of_paths|
71
+ hash_of_paths.keys.each do |path|
72
+ pv = Pageview.find_or_create_by_date_and_path(date, path)
73
+ pv.pageview_count ||= 0
74
+ pv.pageview_count += hash_of_paths[path]
75
+ pv.save!
76
+ persisted += 1
77
+ end
78
+ end
79
+
80
+ hash_of_dates_for_keywords.each do |date, hash_of_paths|
81
+ hash_of_paths.keys.each do |path|
82
+ hash_of_paths[path].keys.each do |keyword|
83
+ pvk = PageviewKeyword.find_or_create_by_date_and_path_and_keyword(date, path, keyword)
84
+ pvk.keyword = keyword.to_s[0..254]
85
+ pvk.pageview_count ||= 0
86
+ pvk.pageview_count += hash_of_paths[path][keyword]
87
+ pvk.save!
88
+ persisted_pageview_keywords += 1
89
+ end
90
+ end
91
+ end
92
+
93
+ end
94
+
95
+ puts "completed persistence in #{realtime}"
96
+
97
+ Update.create!({:last_pageview_at => self.last_pageview_at, :lines_parsed => lines_parsed, :last_line_parsed => last_line_parsed.to_s[0..254]})
98
+
99
+ puts("total records in DB: #{Pageview.count}
100
+ lines parsed this round: #{lines_parsed}
101
+ lines persisted this round:#{persisted}
102
+ total SkinnyJeans executions since inception: #{Update.count}")
103
+
104
+ return self
105
+
106
+ end
107
+
108
+ # copies the log file, reads it, then removes it
109
+ def file_reader
110
+
111
+ temp_file_path = "#{@logfile_path}.copy"
112
+ temp_file = FileUtils.cp(@logfile_path, temp_file_path)
113
+
114
+ if @is_gzipped
115
+ lineno = 0
116
+ Zlib::GzipReader.new(File.new(temp_file_path, "r")).each_line{|line|yield([line,lineno]);lineno+=1}
117
+ # Zlib::GzipReader.open(@logfile_path).each_line{|line|yield([line,lineno]);lineno+=1}
118
+ else
119
+ File.new(temp_file_path, "r").each_with_index{|line, lineno| yield([line,lineno])}
120
+ end
121
+
122
+ FileUtils.rm_f(temp_file_path)
123
+ end
124
+
125
+ def pageview;get_ar_class(Pageview);end
126
+ def update;get_ar_class(Update);end
127
+ def pageview_keyword;get_ar_class(PageviewKeyword);end
128
+
129
+ def get_ar_class(klass)
130
+ begin;return(klass);rescue(ActiveRecord::ConnectionNotEstablished);SkinnyJeans::prepare_db(@sqlite_db_path);end
131
+ end
132
+
133
+ private
134
+
135
+ # returns a Ruby Time object parsed from an access-log timestamp such as "02/Oct/2010:11:17:44 -0700"
136
+ def parse_string_as_date(date_string = "02/Oct/2010:11:17:44 -0700")
137
+ day,month,year,hour,minute,seconds,zone = date_string.scan(/(\d{1,2})\/(\w{3,5})\/(\d{4}):(\d\d):(\d\d):(\d\d)\s(-?\d{3,4})/).flatten
138
+ Time.parse("#{year}-#{month}-#{day} #{hour}:#{minute}:#{seconds} #{zone}")
139
+ end
140
+
141
+ def insert_or_increment(_datetime_obj, _path, _search_keyword = nil)
142
+
143
+ date_string = _datetime_obj.strftime(("%Y-%m-%d"))
144
+
145
+ # data for all pageviews
146
+ hash_of_dates[date_string] ||= {}
147
+ hash_of_dates[date_string][_path] ||= 0
148
+ hash_of_dates[date_string][_path] += 1
149
+
150
+ return if _search_keyword.nil?
151
+
152
+ # data for just pageviews coming from search
153
+ hash_of_dates_for_keywords[date_string] ||= {}
154
+ hash_of_dates_for_keywords[date_string][_path] ||= {}
155
+ hash_of_dates_for_keywords[date_string][_path][_search_keyword] ||= 0
156
+ hash_of_dates_for_keywords[date_string][_path][_search_keyword] += 1
157
+
158
+ end
159
+
160
+
161
+
162
+ end
163
+
164
+ end
@@ -0,0 +1,72 @@
1
+ # example
2
+ # SkinnyJeans::StringParser.extract_search_query("http://search.aol.com/aol/search?enabled_terms=&s_it=comsearch50&q=cool+stuff")
3
+ # => "cool stuff"
4
+
5
+ module SkinnyJeans
6
+ class StringParser
7
+
8
+ class << self
9
+ def extract_search_query(_url)
10
+ self.new(_url).get_search_keyword
11
+ end
12
+
13
+ # pre: some referring URL from google, yahoo, AOL, bing, ask
14
+ # post: the search query, or nil if there is none or it contains characters outside the whitelist
15
+ def extract_search_query_from_valid_url(url)
16
+ val = nil
17
+ case url
18
+ when /google\.com/
19
+ val=return_param_from_valid_url_or_path(url,"q")
20
+ when /search\.yahoo\.com/
21
+ val=return_param_from_valid_url_or_path(url,"p")
22
+ when /search\.aol\.com/
23
+ val=return_param_from_valid_url_or_path(url,"q")
24
+ when /ask\.com/
25
+ val=return_param_from_valid_url_or_path(url,"q")
26
+ when /bing\.com/
27
+ val=return_param_from_valid_url_or_path(url,"q")
28
+ end
29
+ # whitelist of acceptable characters
30
+ val = val.present? && val.gsub(/[^0-9A-Za-z\s"'!@#\$%\^&\*\(\)\?\<\>\[\]:;,\.+-_=]/, '') != val ? nil : val
31
+ return val
32
+ end
33
+
34
+ # pre: like http://example.org?q=cool&fun=no, "fun"
35
+ # post: "no"
36
+ def return_param_from_valid_url_or_path(url_or_path, param_name)
37
+ _uri = URI.parse(URI.encode(url_or_path))
38
+ if _uri.query.present?
39
+ _cgi = CGI.parse(_uri.query)
40
+ if _cgi[param_name]
41
+ val = unescape_string(_cgi[param_name].join).strip.downcase
42
+ return (!val.nil? && val!='' ? val : nil)
43
+ end
44
+ end
45
+ return nil
46
+ end
47
+
48
+ def unescape_string(_string)
49
+ temp = _string.dup
50
+ temp = CGI.unescape(temp) while CGI.unescape(temp) != temp
51
+ temp
52
+ end
53
+ end
54
+
55
+ attr_accessor :string_value
56
+ def initialize(string_value)
57
+ @string_value = string_value
58
+ end
59
+
60
+ # find all whitespace-delimited tokens in the string that start with http: or https: (optionally quoted); returns nil if there are none
61
+ def all_urls
62
+ @all_urls ||= string_value.split(/\s+/).reject { |_string| !_string.match(/^['"]?https?:['"]?/) }.collect { |url| url.gsub(/["']/,'') }
63
+ @all_urls.empty? ? nil : @all_urls
64
+ end
65
+
66
+ # iterate through any URLs we find in a string and return a search query or nil
67
+ def get_search_keyword
68
+ !all_urls.nil? ? all_urls.collect { |_url| self.class.extract_search_query_from_valid_url(_url) }[0] : nil
69
+ end
70
+
71
+ end
72
+ end
data/lib/skinny_jeans.rb CHANGED
@@ -1,219 +1,65 @@
1
1
  require 'time'
2
2
  require 'benchmark'
3
- require 'rubygems'
4
- require 'sqlite3'
5
- require 'active_record'
6
3
  require 'zlib'
7
4
  require 'fileutils'
8
5
  require 'uri'
9
6
  require 'cgi'
10
- require 'skinny_jeans_string_parser'
7
+ require 'rubygems'
8
+ require 'active_record'
9
+ require 'sqlite3'
10
+ require File.expand_path(File.dirname(__FILE__) + "/skinny_jeans/string_parser")
11
+ require File.expand_path(File.dirname(__FILE__) + "/skinny_jeans/log_parser")
11
12
  # require 'home_run'
12
13
 
13
- class SkinnyJeans
14
-
15
- def self.execute(logfile_path, sqlite_db_path, path_regexp, date_regexp)
16
- self.new(logfile_path, sqlite_db_path, path_regexp, date_regexp).execute
17
- end
18
-
19
- attr_accessor :hash_of_dates, :hash_of_dates_for_keywords, :last_pageview_at
20
-
21
- def initialize(logfile_path, sqlite_db_path, path_regexp, date_regexp)
22
- @logfile_path, @sqlite_db_path, @path_regexp, @date_regexp = [logfile_path, sqlite_db_path, path_regexp, date_regexp]
23
- @is_gzipped = !logfile_path.to_s[/gz/].nil?
24
- prepare_db
25
- @hash_of_dates = {}
26
- @hash_of_dates_for_keywords = {}
27
- @last_datetime = nil
28
- end
29
14
 
15
+ # SkinnyJeans::execute(ARGV.first) if "#{$0}".gsub(/.*\//,"") == "skinny_jeans.rb"
16
+ module SkinnyJeans
30
17
  class SkinnyJeanDb < ActiveRecord::Base
31
18
  self.abstract_class = true
32
19
  end
33
-
34
- def prepare_db
35
- # create database if necessary
36
- SQLite3::Database.new(@sqlite_db_path)
37
- SkinnyJeanDb.establish_connection(:adapter => 'sqlite3', :database => @sqlite_db_path)
38
- # create tables if necessary
39
- if !Pageview.table_exists?
40
- SkinnyJeanDb.connection.create_table(:pageviews) do |t|
41
- t.column :date, :date
42
- t.column :path, :string
43
- t.column :pageview_count, :integer
44
- end
45
- # flow tight like skinny jeans with these compound indexes
46
- SkinnyJeanDb.connection.add_index(:pageviews, [:date, :path], :name => "date_path_index")
47
- SkinnyJeanDb.connection.add_index(:pageviews, [:date, :pageview_count], :name => "date_pageview_count_index")
48
- end
49
- if !Update.table_exists?
50
- SkinnyJeanDb.connection.create_table(:updates) do |t|
51
- t.column :last_pageview_at, :timestamp
52
- t.column :lines_parsed, :integer
53
- t.column :last_line_parsed, :string
54
- end
55
- end
56
-
57
- # addition from 2010-12-06 to track search traffic specifically
58
- if !PageviewKeyword.table_exists?
59
- SkinnyJeanDb.connection.create_table(:pageview_keywords) do |t|
60
- t.column :date, :date
61
- t.column :path, :string
62
- t.column :pageview_count, :integer
63
- t.column :keyword, :string
64
- end
65
- SkinnyJeanDb.connection.add_index(:pageview_keywords, [:date, :path, :keyword], :name => "date_path_keyword_index")
66
- # SkinnyJeanDb.connection.add_index(:pageview_keywords, [:date, :pageview_count], :name => "date_pageview_count_index")
67
- end
68
-
69
- end
70
-
71
- def execute
72
-
73
- lines_parsed = 0
74
- last_line_parsed, last_pageview_at, lineno_of_last_line_parsed = [nil,nil,nil]
75
- # last_update = Update.order("id DESC").limit(1).first
76
- last_update = Update.find(:first, :order => "id DESC", :limit => 1)
77
-
78
- # see if the last_line_parsed parsed exists in the current log file
79
- # if it doesnt exist, we'll simply read anything with a timestamp greater than last_pageview_at
80
- if last_update
81
- last_pageview_at, last_line_parsed = last_update.last_pageview_at, last_update.last_line_parsed
82
- file_reader do |line, lineno|
83
- if line.to_s[0..254] == last_line_parsed.to_s[0..254]
84
- lineno_of_last_line_parsed = lineno
85
- break
20
+ class Pageview < SkinnyJeanDb;end
21
+ class PageviewKeyword < SkinnyJeanDb;end
22
+ class Update < SkinnyJeanDb;end
23
+
24
+ class << self
25
+ def prepare_db(sqlite_db_path)
26
+ # create database if necessary
27
+ SQLite3::Database.new(sqlite_db_path)
28
+ SkinnyJeanDb.establish_connection(:adapter => 'sqlite3', :database => sqlite_db_path)
29
+ # create tables if necessary
30
+ if !Pageview.table_exists?
31
+ SkinnyJeanDb.connection.create_table(:pageviews) do |t|
32
+ t.column :date, :date
33
+ t.column :path, :string
34
+ t.column :pageview_count, :integer
86
35
  end
36
+ # flow tight like skinny jeans with these compound indexes
37
+ SkinnyJeanDb.connection.add_index(:pageviews, [:date, :path], :name => "date_path_index")
38
+ SkinnyJeanDb.connection.add_index(:pageviews, [:date, :pageview_count], :name => "date_pageview_count_index")
87
39
  end
88
- puts "last line parsed was\n#{last_line_parsed}\nat lineno #{lineno_of_last_line_parsed}"
89
- end
90
-
91
- realtime = Benchmark.realtime do
92
- date_path_pairs_array = []
93
- lineno = -1
94
-
95
- file_reader do |line, index|
96
- lineno += 1
97
- next if lineno_of_last_line_parsed && lineno <= lineno_of_last_line_parsed
98
-
99
- path_match = line[@path_regexp, 1]
100
- next if path_match.nil?
101
- date_match = line[@date_regexp, 1]
102
- next if date_match.nil?
103
- datetime_obj = parse_string_as_date(date_match)
104
-
105
- next if lineno_of_last_line_parsed.nil? && !last_pageview_at.nil? && datetime_obj < last_pageview_at
106
-
107
- insert_or_increment(datetime_obj, path_match, SkinnyJeansStringParser.extract_search_query(line))
108
- @last_pageview_at = datetime_obj
109
- last_line_parsed = line.to_s[0..254] # only 255 characters because we store it in the database
110
- lines_parsed += 1
111
- end
112
- end
113
-
114
- puts "completed parsing in #{realtime}"
115
-
116
- persisted = 0
117
- persisted_pageview_keywords = 0
118
- realtime = Benchmark.realtime do
119
-
120
- hash_of_dates.each do |date, hash_of_paths|
121
- hash_of_paths.keys.each do |path|
122
- pv = Pageview.find_or_create_by_date_and_path(date, path)
123
- pv.pageview_count ||= 0
124
- pv.pageview_count += hash_of_paths[path]
125
- pv.save!
126
- persisted += 1
40
+ if !Update.table_exists?
41
+ SkinnyJeanDb.connection.create_table(:updates) do |t|
42
+ t.column :last_pageview_at, :timestamp
43
+ t.column :lines_parsed, :integer
44
+ t.column :last_line_parsed, :string
127
45
  end
128
46
  end
129
47
 
130
- hash_of_dates_for_keywords.each do |date, hash_of_paths|
131
- hash_of_paths.keys.each do |path|
132
- hash_of_paths[path].keys.each do |keyword|
133
- pvk = PageviewKeyword.find_or_create_by_date_and_path_and_keyword(date, path, keyword)
134
- pvk.keyword = keyword.to_s[0..254]
135
- pvk.pageview_count ||= 0
136
- pvk.pageview_count += hash_of_paths[path][keyword]
137
- pvk.save!
138
- persisted_pageview_keywords += 1
139
- end
48
+ # addition from 2010-12-06 to track search traffic specifically
49
+ if !PageviewKeyword.table_exists?
50
+ SkinnyJeanDb.connection.create_table(:pageview_keywords) do |t|
51
+ t.column :date, :date
52
+ t.column :path, :string
53
+ t.column :pageview_count, :integer
54
+ t.column :keyword, :string
140
55
  end
56
+ SkinnyJeanDb.connection.add_index(:pageview_keywords, [:date, :path, :keyword], :name => "date_path_keyword_index")
57
+ # SkinnyJeanDb.connection.add_index(:pageview_keywords, [:date, :pageview_count], :name => "date_pageview_count_index")
141
58
  end
142
59
 
143
60
  end
144
61
 
145
- puts "completed persistence in #{realtime}"
146
-
147
- Update.create!({:last_pageview_at => self.last_pageview_at, :lines_parsed => lines_parsed, :last_line_parsed => last_line_parsed.to_s[0..254]})
148
-
149
- puts("total records in DB: #{Pageview.count}
150
- lines parsed this round: #{lines_parsed}
151
- lines persisted this round:#{persisted}
152
- total SkinnyJeans executions since inception: #{Update.count}")
153
-
154
- return self
155
-
156
- end
157
-
158
- # copies the log file, reads it, then removes it
159
- def file_reader
160
-
161
- temp_file_path = "#{@logfile_path}.copy"
162
- temp_file = FileUtils.cp(@logfile_path, temp_file_path)
163
-
164
- if @is_gzipped
165
- lineno = 0
166
- Zlib::GzipReader.new(File.new(temp_file_path, "r")).each_line{|line|yield([line,lineno]);lineno+=1}
167
- # Zlib::GzipReader.open(@logfile_path).each_line{|line|yield([line,lineno]);lineno+=1}
168
- else
169
- File.new(temp_file_path, "r").each_with_index{|line, lineno| yield([line,lineno])}
170
- end
171
-
172
- FileUtils.rm_f(temp_file_path)
173
- end
174
-
175
- def pageview;get_ar_class(Pageview);end
176
- def update;get_ar_class(Update);end
177
- def pageview_keyword;get_ar_class(PageviewKeyword);end
178
-
179
- def get_ar_class(klass)
180
- begin;return(klass);rescue(ActiveRecord::ConnectionNotEstablished);prepare_db;end
181
- end
182
-
183
- private
184
-
185
- # return a ruby Time object
186
- def parse_string_as_date(date_string = "02/Oct/2010:11:17:44 -0700")
187
- day,month,year,hour,minute,seconds,zone = date_string.scan(/(\d{1,2})\/(\w{3,5})\/(\d{4}):(\d\d):(\d\d):(\d\d)\s(-?\d{3,4})/).flatten
188
- Time.parse("#{year}-#{month}-#{day} #{hour}:#{minute}:#{seconds} #{zone}")
189
- end
190
-
191
- def insert_or_increment(_datetime_obj, _path, _search_keyword = nil)
192
-
193
- date_string = _datetime_obj.strftime(("%Y-%m-%d"))
194
-
195
- # data for all pageviews
196
- hash_of_dates[date_string] ||= {}
197
- hash_of_dates[date_string][_path] ||= 0
198
- hash_of_dates[date_string][_path] += 1
199
-
200
- return if _search_keyword.nil?
201
-
202
- # data for just pageviews coming from search
203
- hash_of_dates_for_keywords[date_string] ||= {}
204
- hash_of_dates_for_keywords[date_string][_path] ||= {}
205
- hash_of_dates_for_keywords[date_string][_path][_search_keyword] ||= 0
206
- hash_of_dates_for_keywords[date_string][_path][_search_keyword] += 1
207
-
208
- end
209
-
210
- class Pageview < SkinnyJeanDb
211
- end
212
- class PageviewKeyword < SkinnyJeanDb
213
- end
214
- class Update < SkinnyJeanDb
215
62
  end
216
63
 
217
64
  end
218
65
 
219
- # SkinnyJeans::execute(ARGV.first) if "#{$0}".gsub(/.*\//,"") == "skinny_jeans.rb"
data/skinny_jeans.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{skinny_jeans}
8
- s.version = "0.5.2"
8
+ s.version = "0.6.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Jonathan Otto"]
12
- s.date = %q{2010-12-08}
12
+ s.date = %q{2011-01-05}
13
13
  s.email = %q{jonathan.otto@gmail.com}
14
14
  s.extra_rdoc_files = [
15
15
  "README.rdoc",
@@ -22,7 +22,8 @@ Gem::Specification.new do |s|
22
22
  "TODO",
23
23
  "VERSION",
24
24
  "lib/skinny_jeans.rb",
25
- "lib/skinny_jeans_string_parser.rb",
25
+ "lib/skinny_jeans/log_parser.rb",
26
+ "lib/skinny_jeans/string_parser.rb",
26
27
  "skinny_jeans.gemspec"
27
28
  ]
28
29
  s.homepage = %q{http://github.com/jotto/skinny_jeans}
@@ -1,15 +1,16 @@
1
- require File.dirname(__FILE__) + '/../lib/skinny_jeans'
1
+ # require File.dirname(__FILE__) + '/../lib/skinny_jeans'
2
+ require File.expand_path(File.dirname(__FILE__) + "/../lib/skinny_jeans")
2
3
  require 'test/unit'
3
4
  require 'pp'
4
- require 'fileutils'
5
- class SkinnyJeansStringParserTest < Test::Unit::TestCase
5
+
6
+ class SkinnyJeans::StringParserTest < Test::Unit::TestCase
6
7
 
7
8
 
8
9
  def test_can_get_all_urls_from_string
9
10
  _string=<<-EOF
10
11
  98.244.200.209 - - [01/Dec/2010:11:51:26 -0800] "GET /deals/apple-ipod-touch HTTP/1.1" 200 11448 "http://www.google.com/m/search?oe=UTF-8&client=safari&hl=en&q=best+deals+for+the+4th+generation+iPod+touch+32+gb&gws_link_params=spell:1&ei=aqb2TJDBLqGutgfp862NAg&ved=0CBEQBSgA" "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_1 like Mac OS X; en-us) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8B117 Safari/6531.22.7" "-"
11
12
  EOF
12
- sjsp = SkinnyJeansStringParser.new(_string)
13
+ sjsp = SkinnyJeans::StringParser.new(_string)
13
14
  assert_equal 1, sjsp.all_urls.size
14
15
  assert_equal "http://www.google.com/m/search?oe=UTF-8&client=safari&hl=en&q=best+deals+for+the+4th+generation+iPod+touch+32+gb&gws_link_params=spell:1&ei=aqb2TJDBLqGutgfp862NAg&ved=0CBEQBSgA",
15
16
  sjsp.all_urls.first
@@ -19,7 +20,7 @@ class SkinnyJeansStringParserTest < Test::Unit::TestCase
19
20
  _string=<<-EOF
20
21
  http://www.bing.com/search?q=XPS+17+coupon&src={referrer:source?}
21
22
  EOF
22
- sjsp = SkinnyJeansStringParser.new(_string)
23
+ sjsp = SkinnyJeans::StringParser.new(_string)
23
24
  assert_equal "xps 17 coupon", sjsp.get_search_keyword
24
25
  end
25
26
 
@@ -27,7 +28,7 @@ class SkinnyJeansStringParserTest < Test::Unit::TestCase
27
28
  _string=<<-EOF
28
29
  207.46.12.204 - - [01/Dec/2010:11:48:00 -0800] "GET /deals/skullcandy-inkd-earbuds HTTP/1.1" 200 5732 "-" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; SLCC1; .NET CLR 1.1.4322; .NET CLR 2.0.40607; .NET CLR 3.0.04506.648)" "-"
29
30
  EOF
30
- sjsp = SkinnyJeansStringParser.new(_string)
31
+ sjsp = SkinnyJeans::StringParser.new(_string)
31
32
  assert_nil sjsp.all_urls
32
33
 
33
34
  assert_nil sjsp.get_search_keyword
@@ -37,8 +38,27 @@ class SkinnyJeansStringParserTest < Test::Unit::TestCase
37
38
  _string=<<-EOF
38
39
  98.244.200.209 - - [01/Dec/2010:11:51:26 -0800] "GET /deals/apple-ipod-touch HTTP/1.1" 200 11448 "http://www.google.com/m/search?oe=UTF-8&client=safari&hl=en&q=best+deals+for+the+4th+generation+iPod+touch+32+gb&gws_link_params=spell:1&ei=aqb2TJDBLqGutgfp862NAg&ved=0CBEQBSgA" "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_1 like Mac OS X; en-us) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8B117 Safari/6531.22.7" "-"
39
40
  EOF
40
- sjsp = SkinnyJeansStringParser.new(_string)
41
+ sjsp = SkinnyJeans::StringParser.new(_string)
41
42
  assert_equal "best deals for the 4th generation ipod touch 32 gb", sjsp.get_search_keyword
42
43
  end
43
44
 
45
+ def test_return_param_from_url
46
+ assert_equal "nowai", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("/deals/hp-dv6-laptop?ok=yes&yea=true&drc=nowai","drc")
47
+ assert_equal "iDesign Tower Stereo System".downcase, SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://us.yhs.search.yahoo.com/if?partnerid=yhs-if-freecause&fr=yhs-if-freecause&ei=UTF-8&YST_b=21&tid=61613&uid=47727219&p=iDesign Tower Stereo System","p")
48
+ assert_equal "yes", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("/deals/hp-dv6-laptop?ok=yes&yea=true&drc=nowai","ok")
49
+ assert_equal "yes", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://dealzon.com/deals/hp-dv6-laptop?ok=yes&yea=true&drc=nowai","ok")
50
+ assert_equal "nowai", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://dealzon.com/deals/hp-dv6-laptop?ok=yes&yea=true&drc=nowai","drc")
51
+
52
+
53
+ assert_equal "hp coupon", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://search.yahoo.com/search;_ylt=AqnIgbSoqn0rhe69ABoUdv.bvZx4?p=hp+coupon&toggle=1&cop=mss&ei=UTF-8&fr=yfp-t-701","p")
54
+ assert_equal "hp coupon", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://search.yahoo.com/search;_ylt=A0oG7h6HJPtMlGIBsFyl87UF;_ylc=X1MDMjE0MjQ3ODk0OARfcgMyBGZyA3NmcARuX2dwcwMxMARvcmlnaW4Dc3ljBHF1ZXJ5A2hwIGNvdXBvbgRzYW8DMQ--?p=hp+coupon&fr=sfp&fr2=&iscqry=","p")
55
+ assert_equal "hp coupon", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://www.ask.com/web?q=hp+coupon&search=&qsrc=0&o=0&l=dir","q")
56
+ assert_equal "hp coupon", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://search.aol.com/aol/search?enabled_terms=&s_it=comsearch50&q=hp+coupon","q")
57
+ assert_equal "hp coupns", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://www.bing.com/search?q=hp+coupns&go=&form=QBLH&qs=n&sk=&sc=8-6","q")
58
+ assert_equal "dv6 coupon", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://www.google.com/url?sa=t&source=web&cd=1&ved=0CCEQFjAA&url=http%3A%2F%2Fdealzon.com%2Fdeals%2Fhp-dv6-laptop&rct=j&q=dv6%20coupon&ei=1Pz7TKb5AoP7lwfEoeSgBQ&usg=AFQjCNGFVL4PvZ59hbxkCFVdmHMayEe3UQ&sig2=Vs7s9a1z2Elm23NVffMJ8A","q")
59
+ assert_equal "dv6 coupon", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://www.google.com/url?sa=t&source=web&cd=1&sqi=2&ved=0CCAQFjAA&url=http%3A%2F%2Fdealzon.com%2Fdeals%2Fhp-dv6-laptop&rct=j&q=dv6%20coupon&ei=xyT7TO7kEYGKlwfT9tSPDA&usg=AFQjCNGFVL4PvZ59hbxkCFVdmHMayEe3UQ&sig2=7pnUG3YTE8sHONsIqLl1sg","q")
60
+
61
+ assert_equal "dv6 coupon", SkinnyJeans::StringParser.return_param_from_valid_url_or_path("http://www.google.com/url?sa=t&source=web&cd=1&sqi=2&ved=0CCAQFjAA&url=http%3A%2F%2Fdealzon.com%2Fdeals%2Fhp-dv6-laptop&rct=j&q=dv6%20coupon%20&ei=xyT7TO7kEYGKlwfT9tSPDA&usg=AFQjCNGFVL4PvZ59hbxkCFVdmHMayEe3UQ&sig2=7pnUG3YTE8sHONsIqLl1sg","q")
62
+ end
63
+
44
64
  end
@@ -1,14 +1,17 @@
1
- require File.dirname(__FILE__) + '/../lib/skinny_jeans'
1
+
2
+ require File.expand_path(File.dirname(__FILE__) + "/../lib/skinny_jeans")
2
3
  require 'test/unit'
3
4
  require 'pp'
4
- require 'fileutils'
5
+
5
6
  class SkinnyJeansTest < Test::Unit::TestCase
6
7
 
7
8
 
8
9
  def test_parse_pick_up_where_left_off
9
- db_path = "./skinny_jeans_test.db"
10
+ db_path = File.expand_path(File.dirname(__FILE__) + "/skinny_jeans_test.db")
11
+ # db_path = "./skinny_jeans_test.db"
10
12
  FileUtils.rm(db_path) if File.exists?(db_path)
11
- sj=SkinnyJeans.new(logfile_path = "small_access_log.log", sqlite_skinny_jeans = db_path, path_regexp = /\s\/deals\/(.*)\sHTTP/, date_regexp = /\[(\d.*\d)\]/)
13
+ _logfile_path = File.expand_path(File.dirname(__FILE__) + "/small_access_log.log")
14
+ sj=SkinnyJeans::LogParser.new(_logfile_path, sqlite_skinny_jeans = db_path, path_regexp = /\s\/deals\/(.*)\sHTTP/, date_regexp = /\[(\d.*\d)\]/)
12
15
  sj.execute
13
16
  assert_equal 20, sj.pageview.count
14
17
  assert_equal 2, sj.pageview.find_by_path("flip-video").pageview_count
@@ -20,7 +23,8 @@ class SkinnyJeansTest < Test::Unit::TestCase
20
23
  #
21
24
  # "
22
25
  # the 2nd file is the same, but with 2 additional lines for flip-video and apple-ipod-touch
23
- sj=SkinnyJeans.new(logfile_path = "small_access_log_part_2.log", sqlite_skinny_jeans = db_path, path_regexp = /\s\/deals\/(.*)\sHTTP/, date_regexp = /\[(\d.*\d)\]/)
26
+ _logfile_path_2 = File.expand_path(File.dirname(__FILE__) + "/small_access_log_part_2.log")
27
+ sj=SkinnyJeans::LogParser.new(_logfile_path_2, sqlite_skinny_jeans = db_path, path_regexp = /\s\/deals\/(.*)\sHTTP/, date_regexp = /\[(\d.*\d)\]/)
24
28
  sj.execute
25
29
  assert_equal 3, sj.pageview.find_by_path("flip-video").pageview_count
26
30
  assert_equal 2, sj.pageview.find_by_path("apple-ipod-touch").pageview_count
@@ -39,8 +43,9 @@ class SkinnyJeansTest < Test::Unit::TestCase
39
43
  #
40
44
  # "
41
45
 
46
+ _logfile_path_3 = File.expand_path(File.dirname(__FILE__) + "/small_access_log_part_3.log")
42
47
  # the 3rd has 1 additional line so we can ensure we can leave off on a line over 255 characters
43
- sj=SkinnyJeans.new(logfile_path = "small_access_log_part_3.log", sqlite_skinny_jeans = db_path, path_regexp = /\s\/deals\/(.*)\sHTTP/, date_regexp = /\[(\d.*\d)\]/)
48
+ sj=SkinnyJeans::LogParser.new(_logfile_path_3, sqlite_skinny_jeans = db_path, path_regexp = /\s\/deals\/(.*)\sHTTP/, date_regexp = /\[(\d.*\d)\]/)
44
49
  sj.execute
45
50
  assert_equal 3, sj.pageview.find_by_path("delonghi-hhp1500-safeheat-mica-panel-radiator-heater-with-thermostat-control").pageview_count
46
51
  assert_equal 3, sj.pageview.find_by_path("apple-ipod-touch").pageview_count
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: skinny_jeans
3
3
  version: !ruby/object:Gem::Version
4
- hash: 15
4
+ hash: 7
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 5
9
- - 2
10
- version: 0.5.2
8
+ - 6
9
+ - 0
10
+ version: 0.6.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jonathan Otto
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-12-08 00:00:00 -06:00
18
+ date: 2011-01-05 00:00:00 -06:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -66,7 +66,8 @@ files:
66
66
  - TODO
67
67
  - VERSION
68
68
  - lib/skinny_jeans.rb
69
- - lib/skinny_jeans_string_parser.rb
69
+ - lib/skinny_jeans/log_parser.rb
70
+ - lib/skinny_jeans/string_parser.rb
70
71
  - skinny_jeans.gemspec
71
72
  - test/skinny_jeans_string_parser_test.rb
72
73
  - test/skinny_jeans_test.rb
@@ -1,70 +0,0 @@
1
- # example
2
- # SkinnyJeansStringParser.extract_search_query("http://search.aol.com/aol/search?enabled_terms=&s_it=comsearch50&q=cool+stuff")
3
- # => "cool stuff"
4
-
5
- class SkinnyJeansStringParser
6
-
7
- def self.extract_search_query(_url)
8
- self.new(_url).get_search_keyword
9
- end
10
-
11
- attr_accessor :string_value
12
- def initialize(string_value)
13
- @string_value = string_value
14
- end
15
-
16
- # iterate through any URLs we find in a string and return a search query or nil
17
- def get_search_keyword
18
- !all_urls.nil? ? all_urls.collect { |_url| extract_search_query_from_url(_url) }[0] : nil
19
- end
20
-
21
- # pre: some referring URL from google, yahoo, AOL, bing, ask
22
- # post: whatever the search query was, ASCII or GTFO
23
- def extract_search_query_from_url(url)
24
- val = nil
25
- case url
26
- when /google\.com/
27
- val=return_param_from_url(url, "q")
28
- when /search\.yahoo\.com/
29
- val=return_param_from_url(url, "p")
30
- when /search\.aol\.com/
31
- val=return_param_from_url(url, "q")
32
- when /ask\.com/
33
- val=return_param_from_url(url, "q")
34
- when /bing\.com/
35
- val=return_param_from_url(url, "q")
36
- end
37
- # whitelist of acceptable characters
38
- val = val.present? && val.gsub(/[^0-9A-Za-z\s"'!@#\$%\^&\*\(\)\?\<\>\[\]:;,\.+-_=]/, '') != val ? nil : val
39
- return val
40
- end
41
-
42
- # pre: like http://example.org?q=cool&fun=no, "fun"
43
- # post: "no"
44
- def return_param_from_url(url, param_name)
45
- _uri = URI.parse(URI.encode(url))
46
- if _uri.query.present?
47
- _cgi = CGI.parse(_uri.query)
48
- if _cgi[param_name]
49
- val = unescape_string(_cgi[param_name].to_s).strip.downcase
50
- return (!val.nil? && val!='' ? val : nil)
51
- end
52
- end
53
- return nil
54
- end
55
-
56
- # find all URLs in a string that are at beginning or end of string or are tokenized by spaces
57
- def all_urls
58
- @all_urls ||= string_value.split(/\s+/).reject { |_string| !_string.match(/^['"]?https?:['"]?/) }.collect { |url| url.gsub(/["']/,'') }
59
- @all_urls.empty? ? nil : @all_urls
60
- end
61
-
62
- private
63
- def unescape_string(_string)
64
- temp = _string.dup
65
- temp = CGI.unescape(temp) while CGI.unescape(temp) != temp
66
- temp
67
- end
68
-
69
-
70
- end