web_analytics_discovery 2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,44 @@
1
+ require 'web_analytics_discovery/grabberutils'
2
+
3
module WebAnalyticsDiscovery
  # Discovers traffic statistics for sites carrying an Openstat
  # (ex-SpyLog) counter by scraping the public pages at rating.openstat.ru.
  class Openstat
    include GrabberUtils

    # Example of a known large site ID (kept for reference/testing).
    BIG_SITE_ID = 601119

    # Downloads +url+, detects an Openstat/SpyLog counter on the page and
    # queries the rating service for its traffic numbers.
    def run(url)
      @page = download(url)
      run_id(find_id)
    end

    # Extracts the counter ID from the previously downloaded page.
    # Returns a String for the modern <span> counter markup, an Integer
    # for the legacy spylog.com <img> counter, or nil if none is found.
    def find_id
      case @page
      when /<span id="(?:openstat|spylog)(\d+)"><\/span>/
        $1
      when /<img src=["']?http:\/\/u([0-9.]+)\.spylog\.com\/cnt/
        $1.gsub(/[.]/, '').to_i
      else
        nil
      end
    end

    # Fetches the public rating page for +id+ and returns a Hash with
    # daily and monthly visitors/visits/pageviews, or nil if +id+ is nil.
    def run_id(id)
      return nil unless id
      r = {:id => id}
      doc = download("http://rating.openstat.ru/site/#{id}")
      r[:visitors_day], r[:visits_day], r[:pv_day] = grab(doc, 'osb-rating_site-e-table-col-m-day')
      r[:visitors_mon], r[:visits_mon], r[:pv_mon] = grab(doc, 'osb-rating_site-e-table-col-m-month')
      return r
    end

    # Collects the integer values of all table cells whose class matches
    # +classname+. Uses String#scan instead of the original gsub-with-block
    # hack: gsub was being called purely for its iteration side effect and
    # its (rebuilt-string) return value thrown away; scan is the idiomatic
    # way to iterate over all matches and builds no throwaway string.
    def grab(doc, classname)
      doc.scan(/#{classname}">([^<]+)<\/td>/).map { |(cell)|
        # Openstat renders thousands separators as &nbsp; entities.
        cell.gsub(/&nbsp;/, '').to_i
      }
    end
  end
end
@@ -0,0 +1,84 @@
1
+ # -*- coding: UTF-8 -*-
2
+
3
+ require 'cgi'
4
+ require 'uri'
5
+ require 'json'
6
+ require 'web_analytics_discovery/grabberutils'
7
+
8
module WebAnalyticsDiscovery
  # Looks up site traffic estimates on quantcast.com, using either the
  # directly measured ("quantified") statistics or the modeled US reach.
  class Quantcast
    include GrabberUtils

    # Runs discovery for +url+; Quantcast profiles are keyed by hostname.
    def run(url)
      run_id(URI.parse(url).host)
    end

    # Fetches the Quantcast profile page for +host+ and returns a Hash of
    # traffic figures, a Hash with only :id when no estimate exists, or
    # nil when the site is unknown to Quantcast.
    def run_id(host)
      result = {}

      # Get auth cookies
      profile = download("https://www.quantcast.com/#{host}")
      return nil unless profile =~ /<td class="reach" id="reach-(.*?)">/
      site_id = $1
      result[:id] = site_id

      # Quantcast has no traffic info? We should stop here
      if profile =~ /content="We do not have enough information to provide a traffic estimate./
        return result
      end

      if profile =~ /<h4>Quantified<\/h4>/
        return run_id_quantified(result, host)
      end

      # Use auth cookies with API call
      data = traffic_api_call(host, site_id, 'US', 'DAY30')

      result[:visitors_mon] = data['summaries']['US']['PEOPLE']['WEB']['reach'].to_i
      result
    end

    # Parse more precise, direct statistics on a quantified site
    def run_id_quantified(r, host)
      daily = traffic_api_call(host, r[:id], 'GLOBAL', 'DAY1')
      r[:visitors_day] = avg_last_metric(daily, 'UNIQUES', 7)
      r[:pv_day] = avg_last_metric(daily, 'PAGE_VIEWS', 7)
      r[:visits_day] = avg_last_metric(daily, 'VISITS', 7)

      weekly = traffic_api_call(host, r[:id], 'GLOBAL', 'DAY7')
      r[:visitors_week] = avg_last_metric(weekly, 'UNIQUES', 1)
      r[:pv_week] = avg_last_metric(weekly, 'PAGE_VIEWS', 1)
      r[:visits_week] = avg_last_metric(weekly, 'VISITS', 1)

      monthly = traffic_api_call(host, r[:id], 'GLOBAL', 'DAY30')['summaries']['GLOBAL']
      r[:visitors_mon] = monthly['UNIQUES']['WEB']['reach'].to_i
      r[:pv_mon] = monthly['PAGE_VIEWS']['WEB']['reach'].to_i
      r[:visits_mon] = monthly['VISITS']['WEB']['reach'].to_i

      r
    end

    # Calls the Quantcast traffic API (reusing the auth cookies obtained
    # while downloading the profile page) and returns the parsed payload.
    def traffic_api_call(host, id, country, period)
      encoded_id = CGI::escape(id)
      doc = download(
        "https://www.quantcast.com/api/profile/traffic/?&wUnit=#{encoded_id}&country=#{country}&period=#{period}&countType=",
        'UTF-8',
        'Referer' => "https://www.quantcast.com/#{host}?country=#{country}",
        'X-Requested-With' => 'XMLHttpRequest'
      )
      JSON.load(doc).first
    end

    # Averages the last +last_n+ data points of +metric+, summing the
    # online-web and mobile-web series together.
    def avg_last_metric(d, metric, last_n)
      series = d['reach']['GLOBAL'][metric]
      total = 0
      ['ONLINE_WEB', 'MOBILE_WEB'].each { |channel|
        series[channel][-last_n..-1].each { |pt| total += pt['reach'] }
      }
      (total / last_n).to_i
    end
  end
end
@@ -0,0 +1,100 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require 'web_analytics_discovery/grabberutils'
4
+
5
module WebAnalyticsDiscovery
  # Discovers traffic statistics for sites registered in the Rambler
  # Top100 rating (top100.rambler.ru).
  class Rambler
    include GrabberUtils

    SEC_PER_DAY = 24 * 60 * 60

    def run(url)
      @page = download(url)
      run_id(find_id)
    end

    # Recognizes any of the known Rambler Top100 counter embeddings
    # (new JS API, legacy CGI link, legacy script/img counters) on the
    # downloaded page; returns the numeric counter ID, or nil.
    def find_id
      case @page
      when /_top100q.push\(\["setAccount", "(\d+)"\]\)/,
           /<a href="http:\/\/top100\.rambler\.ru\/cgi-bin\/stats_top100\.cgi\?(\d+)"/,
           /<script.*src="http:\/\/counter\.rambler\.ru\/top100\.jcn\?(\d+)/,
           /<img src="http:\/\/counter\.rambler\.ru\/top100\.cnt\?(\d+)"/
        $1.to_i
      else
        nil
      end
    end

    # Pulls statistics for counter +id+, preferring the CSV export and
    # falling back to the public rating catalogue. Returns a Hash or nil.
    def run_id(id)
      return nil unless id
      r = {:id => id}

      # doc = download("http://top100.rambler.ru/resStats/#{id}/")
      doc = download("http://top100.rambler.ru/resStats/#{id}/?_export=csv&_id=#{id}&_page=0", 'UTF-16LE')

      # CSV columns: today, yesterday, last 7 days, previous week,
      # last 30 days, previous month.
      if (vals = csv_row_stats(doc, /уникальных\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)/))
        r[:visitors_day], r[:visitors_week], r[:visitors_mon] = vals
      end

      if (vals = csv_row_stats(doc, /Визитов \(сессий\)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)/))
        r[:visits_day], r[:visits_week], r[:visits_mon] = vals
      end

      if (vals = csv_row_stats(doc, /Просмотров страниц\n\s*всего\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)/))
        r[:pv_day], r[:pv_week], r[:pv_mon] = vals
      end

      # Plan B: if proper CSV export failed, we'll try to look up information in rating catalogue
      unless r[:visitors_day]
        now = Time.now
        r[:visitors_day], r[:pv_day] = rating_lookup(spec_yesterday(now), id)
        r[:visitors_week], r[:pv_week] = rating_lookup(spec_last_week(now), id)
        r[:visitors_mon], r[:pv_mon] = rating_lookup(spec_last_month(now), id)
      end

      r
    end

    # Matches +re+ (expected to have six numeric captures) against the CSV
    # export and returns the "yesterday / previous week / previous month"
    # columns (captures 2, 4, 6) as integers, or nil when the row is absent.
    def csv_row_stats(doc, re)
      doc =~ re ? [$2.to_i, $4.to_i, $6.to_i] : nil
    end

    # Queries the rating catalogue for date range +range+ and counter +id+;
    # returns [visitors, pageviews].
    def rating_lookup(range, id)
      parse_rating_table("http://top100.rambler.ru/?range=#{range}&stat=1&statcol=1%2C2&query=#{id}", id)
    end

    # Scrapes the visitors/pageviews cells for counter +id+ out of a
    # catalogue page; returns [nil, nil] when the row cannot be found.
    def parse_rating_table(url, id)
      doc = download(url)
      if doc =~ /<tr>(\s*<td align="right">.*?<a href="\/resStats\/#{id}\/.*?)<\/tr>/m
        row = $1
        if row =~ /<td align="right">([0-9&nbsp;]+)<\/td>\s*<td class="last" align="right">([0-9&nbsp;]+)<\/td>/m
          # Numbers use &nbsp; as a thousands separator.
          return [$1, $2].map { |cell| cell.gsub(/&nbsp;/, '').to_i }
        end
      end
      [nil, nil]
    end

    # Yesterday's date in the dd.mm.yyyy format the catalogue expects.
    def spec_yesterday(now)
      (now - SEC_PER_DAY).strftime('%d.%m.%Y')
    end

    # Date-range spec covering the last full calendar week (Mon-Sun).
    def spec_last_week(now)
      days_into_week = now.wday
      days_into_week = 7 if days_into_week == 0
      week_end = now - days_into_week * SEC_PER_DAY
      week_start = week_end - 6 * SEC_PER_DAY
      "#{week_start.strftime('%d.%m.%Y')}+-+#{week_end.strftime('%d.%m.%Y')}"
    end

    # Date-range spec covering the previous calendar month.
    def spec_last_month(now)
      first_of_this_month = Time.new(now.year, now.month, 1)
      prev_month_end = first_of_this_month - SEC_PER_DAY
      prev_month_start = Time.new(prev_month_end.year, prev_month_end.month, 1)
      "#{prev_month_start.strftime('%d.%m.%Y')}+-+#{prev_month_end.strftime('%d.%m.%Y')}"
    end
  end
end
@@ -0,0 +1,117 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
require 'date'
require 'uri'

require 'web_analytics_discovery/grabberutils'
6
+
7
module WebAnalyticsDiscovery
  # Discovers monthly audience figures from the public TNS Web Index
  # reports published at tns-global.ru. Relies on the external `unzip`
  # and `xlsx2csv` command-line utilities.
  class TNS
    include GrabberUtils

    # Verifies that the external tools are available, failing fast
    # otherwise. The tools' output is not needed, only their presence.
    def initialize
      # This one requires xlsx2csv utility
      begin
        `xlsx2csv --version`
      rescue Errno::ENOENT
        raise 'xlsx2csv not available: unable to run TNS report discovery'
      end

      # And an unzip utility
      begin
        `unzip -v`
      rescue Errno::ENOENT
        raise 'unzip not available: unable to run TNS report discovery'
      end
    end

    # Parsing TNS report involves the following stages:
    #
    # 1. Download non-empty "directory" page from their web site
    # for a current year (keep requesting older years if we keep
    # getting empty output, bail out on HTTP error)
    #
    # 2. Download first (most recent) report listed on that "directory" page
    #
    # 3. Unpack (unzip) downloaded report file; it's a zip that
    # contains multiple files, including single .xlsx file with
    # raw data.
    #
    # 4. Convert .xlsx file into something more readable (CSV)
    # with external utility.
    #
    # 5. Parse resulting CSV report into memory (it's relatively
    # short - as of 2014-10, TNS lists only ~500 sites)
    def parse_report
      report_url = query_directory
      zipped = download_file(report_url)
      unzipped = ensure_unpack(zipped)
      converted = ensure_convert(unzipped)

      @report = {}
      File.open(converted).each_line { |l|
        c = l.chomp.split(/\t/)

        # Skip headers
        next if c.size < 5

        # Skip table column headers
        next if c[0].empty?

        # Skip generic audience info columns
        next if c[1].empty?

        # Downcase URL and calculate proper monthly visitors
        # (report figures are in thousands)
        visitors = (c[2].to_f * 1000).to_i
        url = c[1].downcase.gsub(/ \(сайт\)$/, '')

        @report[url] = visitors
      }
    end

    # Maximum number of years to walk back looking for a published report.
    MAX_TRIES = 5

    # Finds the URL of the most recent report, walking back year by year
    # until a download link appears on the directory page.
    def query_directory
      y = Date.today.year
      MAX_TRIES.times {
        dir = download("http://www.tns-global.ru/services/media/media-audience/internet/information/?arrFilter_pf%5BYEAR%5D=#{y}&set_filter=%D0%9F%D0%BE%D0%BA%D0%B0%D0%B7%D0%B0%D1%82%D1%8C&set_filter=Y")
        if dir =~ /<a href="(\/services\/media\/media-audience\/internet\/information\/\?download=\d+&date=.*?)">/
          return "http://www.tns-global.ru#{$1}"
        end
        y -= 1
      }
      raise 'Unable to query report directory - not a single report found'
    end

    # Extracts the single .xlsx payload from the zipped report, caching
    # the result in CACHE_DIR.
    def ensure_unpack(zipped)
      unzipped = "#{CACHE_DIR}/tns_#{File.basename(zipped)}.xlsx"
      # File.exist? - the File.exists? alias was removed in Ruby 3.2.
      unless File.exist?(unzipped)
        system("unzip -pq '#{zipped}' *.xlsx >'#{unzipped}'")
        raise 'Unable to unpack TNS report' unless $?.exitstatus == 0
      end
      return unzipped
    end

    # Converts the .xlsx report to tab-separated text, caching the result.
    def ensure_convert(unzipped)
      converted = "#{CACHE_DIR}/#{File.basename(unzipped)}.tsv"
      unless File.exist?(converted)
        system("xlsx2csv -d tab -s 1 '#{unzipped}' >'#{converted}'")
        raise 'Unable to convert TNS report to .tsv' unless $?.exitstatus == 0
      end
      return converted
    end

    def run(url)
      run_id(find_id(url))
    end

    # TNS reports are keyed by bare hostname.
    def find_id(url)
      URI.parse(url).host
    end

    # Looks up hostname +id+ in the lazily-parsed report; returns
    # {:id, :visitors_mon} or nil when the site is not listed.
    def run_id(id)
      parse_report unless @report
      v = @report[id]
      return v ? {:id => id, :visitors_mon => v} : nil
    end
  end
end
@@ -0,0 +1,54 @@
1
+ # -*- coding: UTF-8 -*-
2
+
3
+ require 'json'
4
+ require 'web_analytics_discovery/grabberutils'
5
+
6
module WebAnalyticsDiscovery
  # Discovers traffic statistics for sites carrying a Yandex.Metrika
  # counter, via the public "informer" JSON endpoint.
  class YandexMetrika
    include GrabberUtils

    def run(url)
      @page = download(url)
      run_id(find_id)
    end

    # Extracts the Metrika counter ID from the downloaded page, or nil.
    def find_id
      case @page
      when /yaCounter(\d+) = new Ya\.Metrika\(\{id:(\d+)/
        $1
      else
        nil
      end
    end

    # Queries the informer endpoint for counter +id+ and returns a Hash
    # with averaged daily figures plus weekly/monthly pageview
    # approximations, or nil when +id+ is nil.
    def run_id(id)
      return nil unless id
      r = {:id => id}

      json = download("http://bs.yandex.ru/informer/#{id}/json")

      # Unfortunately, it's very weird JSON, so it's easier to parse it with regexp
      # {pageviews:[42917,576537,764371,843611,826967,1009246,990612],visits:[21298,278959,309217,335495,324285,420460,430497],uniques:[20511,240509,254201,275157,270031,356657,366913],

      r[:pv_day] = do_list($1) if json =~ /pageviews:\[([0-9,]+)\]/
      r[:visits_day] = do_list($1) if json =~ /visits:\[([0-9,]+)\]/
      r[:visitors_day] = do_list($1) if json =~ /uniques:\[([0-9,]+)\]/

      # Calculate approximations
      r[:pv_week] = r[:pv_day] * 7 if r[:pv_day]
      r[:pv_mon] = (r[:pv_day] * AVG_DAYS_IN_MONTH).to_i if r[:pv_day]

      return r
    end

    # Averages a comma-separated list of daily counts.
    # Returns nil when fewer than two data points are present: the first
    # point is always dropped (incomplete current day), and the original
    # code crashed with NoMethodError on a single-element list because
    # inject over an empty array yields nil.
    def do_list(list)
      els = list.split(/,/).map { |x| x.to_i }

      # Throw out first element, it's current day, which is incomplete
      els.shift

      return nil if els.empty?

      sum = els.inject { |a, b| a + b }
      return sum / els.size
    end
  end
end
@@ -0,0 +1,54 @@
1
+ require 'fileutils'
2
+ require 'digest/md5'
3
+
4
# Shared helpers for all grabbers: cached downloading (via the external
# wget utility) and URL-to-cache-filename mangling.
module GrabberUtils
  # Directory where downloaded files are cached between runs.
  CACHE_DIR = 'cache'
  USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20100101 Firefox/22.0'

  # Raised when wget exits with a non-zero status. Subclasses
  # StandardError (not Exception) so ordinary `rescue` clauses catch it;
  # rescuing/subclassing Exception is a Ruby anti-pattern.
  class DownloadError < StandardError; end

  # Downloads +url+ (honoring the cache) and returns its body as a UTF-8
  # string, transcoding from +encoding+ and replacing invalid bytes.
  def download(url, encoding = 'UTF-8', options = {})
    fn = download_file(url, options)

    # Truly horrible hack to work around Ruby 1.9.2+ strict handling of invalid UTF-8 characters
    s = File.read(fn)
    s.encode!('UTF-16', encoding, :invalid => :replace, :replace => '?')
    s.encode!('UTF-8', 'UTF-16', :invalid => :replace, :replace => '?')
  end

  # Downloads a file, returns filename in cache directory
  def download_file(url, options = {})
    FileUtils.mkdir_p(CACHE_DIR)
    localfile = options['localfile'] || mangle_url(url)
    fn = CACHE_DIR + '/' + localfile
    # File.exist? - the FileTest.exists? alias was removed in Ruby 3.2.
    unless File.exist?(fn)
      opt = {
        'user-agent' => USER_AGENT,
        'load-cookies' => 'cookies.txt',
        'save-cookies' => 'cookies.txt',
      }
      opt['referer'] = options['Referer'] if options['Referer']
      args = opt.map { |k, v| "--#{k}=#{v}" }
      # Argument-vector form of system() bypasses the shell, so special
      # characters in the URL or cache filename cannot be interpreted as
      # shell syntax (the original single-string form was injectable).
      system('wget', '--append-output=wget.log', '--keep-session-cookies',
             "-O#{fn}", *args, url)
      if $?.exitstatus != 0
        File.delete(fn)
        raise DownloadError.new
      end
    end

    return fn
  end

  # Maps a URL to a safe cache filename: short URLs keep a readable
  # mangled form, long ones fall back to an MD5 digest.
  def mangle_url(url)
    if url.length < 200
      url.gsub(/[:\/]/, '_')
    else
      Digest::MD5.hexdigest(url)
    end
  end

  # Average number of days per month
  AVG_DAYS_IN_MONTH = 365.25 / 12
end