web_analytics_discovery 2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,44 @@
+ require 'web_analytics_discovery/grabberutils'
+
+ module WebAnalyticsDiscovery
+   class Openstat
+     include GrabberUtils
+
+     BIG_SITE_ID = 601119
+
+     def run(url)
+       @page = download(url)
+       run_id(find_id)
+     end
+
+     def find_id
+       case @page
+       when /<span id="(?:openstat|spylog)(\d+)"><\/span>/
+         $1
+       when /<img src=["']?http:\/\/u([0-9.]+)\.spylog\.com\/cnt/
+         $1.gsub(/[.]/, '').to_i
+       else
+         nil
+       end
+     end
+
+     def run_id(id)
+       return nil unless id
+       r = {:id => id}
+       doc = download("http://rating.openstat.ru/site/#{id}")
+       r[:visitors_day], r[:visits_day], r[:pv_day] = grab(doc, 'osb-rating_site-e-table-col-m-day')
+       r[:visitors_mon], r[:visits_mon], r[:pv_mon] = grab(doc, 'osb-rating_site-e-table-col-m-month')
+       return r
+     end
+
+     def grab(doc, classname)
+       a = []
+       doc.gsub(/#{classname}">([^<]+)<\/td>/) {
+         r = $1
+         r.gsub!(/&nbsp;/, '')
+         a << r.to_i
+       }
+       return a
+     end
+   end
+ end
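All grabbers in this release share the same two-step interface: run(url) downloads the page and extracts a counter ID, while run_id(id) fetches the stats and returns a hash of metrics (or nil when no counter is found). A minimal usage sketch, assuming the gem's files are already loaded (the diff omits file paths, so no require line is shown):

    stats = WebAnalyticsDiscovery::Openstat.new.run('http://example.ru/')   # hypothetical URL
    if stats
      puts "Counter #{stats[:id]}: ~#{stats[:visitors_day]} visitors/day, #{stats[:pv_mon]} pageviews/month"
    end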
@@ -0,0 +1,84 @@
+ # -*- coding: UTF-8 -*-
+
+ require 'cgi'
+ require 'uri'
+ require 'json'
+ require 'web_analytics_discovery/grabberutils'
+
+ module WebAnalyticsDiscovery
+   class Quantcast
+     include GrabberUtils
+
+     def run(url)
+       uri = URI.parse(url)
+       run_id(uri.host)
+     end
+
+     def run_id(host)
+       r = {}
+
+       # Get auth cookies
+       doc = download("https://www.quantcast.com/#{host}")
+       if doc =~ /<td class="reach" id="reach-(.*?)">/
+         r[:id] = id = $1
+       else
+         return nil
+       end
+
+       # Quantcast has no traffic info for this site, so stop here
+       return r if doc =~ /content="We do not have enough information to provide a traffic estimate./
+
+       return run_id_quantified(r, host) if doc =~ /<h4>Quantified<\/h4>/
+
+       # Use auth cookies with API call
+       d = traffic_api_call(host, id, 'US', 'DAY30')
+
+       # points = d['reach']['US']['PEOPLE']['WEB']
+       # points.each { |pt|
+       #   puts Time.at(pt['timestamp']).to_s + "\t" + pt['reach'].to_s
+       # }
+
+       r[:visitors_mon] = d['summaries']['US']['PEOPLE']['WEB']['reach'].to_i
+
+       return r
+     end
+
+     # Parse more precise, direct statistics for a quantified site
+     def run_id_quantified(r, host)
+       d = traffic_api_call(host, r[:id], 'GLOBAL', 'DAY1')
+       r[:visitors_day] = avg_last_metric(d, 'UNIQUES', 7)
+       r[:pv_day] = avg_last_metric(d, 'PAGE_VIEWS', 7)
+       r[:visits_day] = avg_last_metric(d, 'VISITS', 7)
+
+       d = traffic_api_call(host, r[:id], 'GLOBAL', 'DAY7')
+       r[:visitors_week] = avg_last_metric(d, 'UNIQUES', 1)
+       r[:pv_week] = avg_last_metric(d, 'PAGE_VIEWS', 1)
+       r[:visits_week] = avg_last_metric(d, 'VISITS', 1)
+
+       d = traffic_api_call(host, r[:id], 'GLOBAL', 'DAY30')['summaries']['GLOBAL']
+       r[:visitors_mon] = d['UNIQUES']['WEB']['reach'].to_i
+       r[:pv_mon] = d['PAGE_VIEWS']['WEB']['reach'].to_i
+       r[:visits_mon] = d['VISITS']['WEB']['reach'].to_i
+
+       return r
+     end
+
+     def traffic_api_call(host, id, country, period)
+       id_encoded = CGI::escape(id)
+       doc = download(
+         "https://www.quantcast.com/api/profile/traffic/?&wUnit=#{id_encoded}&country=#{country}&period=#{period}&countType=",
+         'UTF-8',
+         'Referer' => "https://www.quantcast.com/#{host}?country=#{country}",
+         'X-Requested-With' => 'XMLHttpRequest'
+       )
+       return JSON.load(doc).first
+     end
+
+     def avg_last_metric(d, metric, last_n)
+       sum = 0
+       d['reach']['GLOBAL'][metric]['ONLINE_WEB'][-last_n..-1].each { |pt| sum += pt['reach'] }
+       d['reach']['GLOBAL'][metric]['MOBILE_WEB'][-last_n..-1].each { |pt| sum += pt['reach'] }
+       return (sum / last_n).to_i
+     end
+   end
+ end
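avg_last_metric averages the last N daily data points, summing the ONLINE_WEB and MOBILE_WEB series together, so the result is a combined web-plus-mobile daily figure. A toy run on a fabricated response fragment shaped like the structure indexed above:

    # Fabricated data, not a real Quantcast response
    d = {'reach' => {'GLOBAL' => {'UNIQUES' => {
      'ONLINE_WEB' => [{'reach' => 100}, {'reach' => 120}, {'reach' => 110}],
      'MOBILE_WEB' => [{'reach' => 40}, {'reach' => 50}, {'reach' => 60}]
    }}}}

    last_n = 3
    sum = 0
    d['reach']['GLOBAL']['UNIQUES']['ONLINE_WEB'][-last_n..-1].each { |pt| sum += pt['reach'] }
    d['reach']['GLOBAL']['UNIQUES']['MOBILE_WEB'][-last_n..-1].each { |pt| sum += pt['reach'] }
    sum / last_n   # => 160, i.e. (100+120+110+40+50+60) / 3 combined daily uniques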
@@ -0,0 +1,100 @@
+ # -*- coding: utf-8 -*-
+
+ require 'web_analytics_discovery/grabberutils'
+
+ module WebAnalyticsDiscovery
+   class Rambler
+     include GrabberUtils
+
+     SEC_PER_DAY = 24 * 60 * 60
+
+     def run(url)
+       @page = download(url)
+       run_id(find_id)
+     end
+
+     def find_id
+       case @page
+       when /_top100q.push\(\["setAccount", "(\d+)"\]\)/,
+            /<a href="http:\/\/top100\.rambler\.ru\/cgi-bin\/stats_top100\.cgi\?(\d+)"/,
+            /<script.*src="http:\/\/counter\.rambler\.ru\/top100\.jcn\?(\d+)/,
+            /<img src="http:\/\/counter\.rambler\.ru\/top100\.cnt\?(\d+)"/
+         $1.to_i
+       else
+         nil
+       end
+     end
+
+     def run_id(id)
+       return nil unless id
+       r = {:id => id}
+
+       # doc = download("http://top100.rambler.ru/resStats/#{id}/")
+       doc = download("http://top100.rambler.ru/resStats/#{id}/?_export=csv&_id=#{id}&_page=0", 'UTF-16LE')
+
+       # CSV columns: Today, Yesterday, Last 7 days, Previous week, Last 30 days, Previous month
+
+       if doc =~ /уникальных\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)/
+         r[:visitors_day] = $2.to_i
+         r[:visitors_week] = $4.to_i
+         r[:visitors_mon] = $6.to_i
+       end
+
+       if doc =~ /Визитов \(сессий\)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)/
+         r[:visits_day] = $2.to_i
+         r[:visits_week] = $4.to_i
+         r[:visits_mon] = $6.to_i
+       end
+
+       if doc =~ /Просмотров страниц\n\s*всего\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)/
+         r[:pv_day] = $2.to_i
+         r[:pv_week] = $4.to_i
+         r[:pv_mon] = $6.to_i
+       end
+
+       # Plan B: if the proper CSV export failed, try to look up the information in the rating catalogue
+       unless r[:visitors_day]
+         now = Time.now
+         r[:visitors_day], r[:pv_day] = parse_rating_table("http://top100.rambler.ru/?range=#{spec_yesterday(now)}&stat=1&statcol=1%2C2&query=#{id}", id)
+         r[:visitors_week], r[:pv_week] = parse_rating_table("http://top100.rambler.ru/?range=#{spec_last_week(now)}&stat=1&statcol=1%2C2&query=#{id}", id)
+         r[:visitors_mon], r[:pv_mon] = parse_rating_table("http://top100.rambler.ru/?range=#{spec_last_month(now)}&stat=1&statcol=1%2C2&query=#{id}", id)
+       end
+
+       return r
+     end
+
+     def parse_rating_table(url, id)
+       doc = download(url)
+       if doc =~ /<tr>(\s*<td align="right">.*?<a href="\/resStats\/#{id}\/.*?)<\/tr>/m
+         table_row = $1
+         if table_row =~ /<td align="right">([0-9&nbsp;]+)<\/td>\s*<td class="last" align="right">([0-9&nbsp;]+)<\/td>/m
+           v = $1
+           pv = $2
+           v = v.gsub(/&nbsp;/, '').to_i
+           pv = pv.gsub(/&nbsp;/, '').to_i
+           return [v, pv]
+         end
+       end
+       return [nil, nil]
+     end
+
+     def spec_yesterday(now)
+       (now - SEC_PER_DAY).strftime('%d.%m.%Y')
+     end
+
+     def spec_last_week(now)
+       wday = now.wday
+       wday = 7 if wday == 0
+       end_week = now - (wday * SEC_PER_DAY)
+       start_week = end_week - 6 * SEC_PER_DAY
+       "#{start_week.strftime('%d.%m.%Y')}+-+#{end_week.strftime('%d.%m.%Y')}"
+     end
+
+     def spec_last_month(now)
+       this_month_1 = Time.new(now.year, now.month, 1)
+       last_month_end = this_month_1 - SEC_PER_DAY
+       last_month_start = Time.new(last_month_end.year, last_month_end.month, 1)
+       "#{last_month_start.strftime('%d.%m.%Y')}+-+#{last_month_end.strftime('%d.%m.%Y')}"
+     end
+   end
+ end
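The spec_* helpers build the date-range parameters for the rating-catalogue URLs ('+-+' is apparently the URL-encoded ' - ' separator the site expects). Their behavior for a concrete date, worked out by hand:

    r = WebAnalyticsDiscovery::Rambler.new
    now = Time.new(2014, 10, 15)   # a Wednesday
    r.spec_yesterday(now)          # => "14.10.2014"
    r.spec_last_week(now)          # => "06.10.2014+-+12.10.2014", the previous Monday..Sunday
    r.spec_last_month(now)         # => "01.09.2014+-+30.09.2014"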
@@ -0,0 +1,118 @@
+ # -*- coding: utf-8 -*-
+
+ require 'web_analytics_discovery/grabberutils'
+
+ require 'date'
+ require 'uri'
+
+ module WebAnalyticsDiscovery
+   class TNS
+     include GrabberUtils
+
+     def initialize
+       # This one requires the xlsx2csv utility
+       begin
+         `xlsx2csv --version`
+       rescue Errno::ENOENT
+         raise 'xlsx2csv not available: unable to run TNS report discovery'
+       end
+
+       # And an unzip utility
+       begin
+         `unzip -v`
+       rescue Errno::ENOENT
+         raise 'unzip not available: unable to run TNS report discovery'
+       end
+     end
+
+     # Parsing a TNS report involves the following stages:
+     #
+     # 1. Download a non-empty "directory" page from their web site
+     #    for the current year (keep requesting older years as long as
+     #    the output stays empty; bail out on HTTP error)
+     #
+     # 2. Download the first (most recent) report listed on that "directory" page
+     #
+     # 3. Unpack (unzip) the downloaded report file; it's a zip that
+     #    contains multiple files, including a single .xlsx file with
+     #    raw data.
+     #
+     # 4. Convert the .xlsx file into something more readable (CSV)
+     #    with an external utility.
+     #
+     # 5. Parse the resulting CSV report into memory (it's relatively
+     #    short - as of 2014-10, TNS lists only ~500 sites)
+     def parse_report
+       report_url = query_directory
+       zipped = download_file(report_url)
+       unzipped = ensure_unpack(zipped)
+       converted = ensure_convert(unzipped)
+
+       @report = {}
+       File.foreach(converted) { |l|
+         c = l.chomp.split(/\t/)
+
+         # Skip headers
+         next if c.size < 5
+
+         # Skip table column headers
+         next if c[0].empty?
+
+         # Skip generic audience info columns
+         next if c[1].empty?
+
+         # Downcase the URL and calculate proper monthly visitors
+         visitors = (c[2].to_f * 1000).to_i
+         url = c[1].downcase.gsub(/ \(сайт\)$/, '')
+
+         @report[url] = visitors
+       }
+     end
+
+     MAX_TRIES = 5
+
+     def query_directory
+       y = Date.today.year
+       MAX_TRIES.times {
+         dir = download("http://www.tns-global.ru/services/media/media-audience/internet/information/?arrFilter_pf%5BYEAR%5D=#{y}&set_filter=%D0%9F%D0%BE%D0%BA%D0%B0%D0%B7%D0%B0%D1%82%D1%8C&set_filter=Y")
+         if dir =~ /<a href="(\/services\/media\/media-audience\/internet\/information\/\?download=\d+&date=.*?)">/
+           return "http://www.tns-global.ru#{$1}"
+         end
+         y -= 1
+       }
+       raise 'Unable to query report directory - not a single report found'
+     end
+
+     def ensure_unpack(zipped)
+       unzipped = "#{CACHE_DIR}/tns_#{File.basename(zipped)}.xlsx"
+       unless File.exist?(unzipped)
+         system("unzip -pq '#{zipped}' *.xlsx >'#{unzipped}'")
+         raise 'Unable to unpack TNS report' unless $?.exitstatus == 0
+       end
+       return unzipped
+     end
+
+     def ensure_convert(unzipped)
+       converted = "#{CACHE_DIR}/#{File.basename(unzipped)}.tsv"
+       unless File.exist?(converted)
+         system("xlsx2csv -d tab -s 1 '#{unzipped}' >'#{converted}'")
+         raise 'Unable to convert TNS report to .tsv' unless $?.exitstatus == 0
+       end
+       return converted
+     end
+
+     def run(url)
+       run_id(find_id(url))
+     end
+
+     def find_id(url)
+       URI.parse(url).host
+     end
+
+     def run_id(id)
+       parse_report unless @report
+       v = @report[id]
+       return v ? {:id => id, :visitors_mon => v} : nil
+     end
+   end
+ end
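Note that run_id calls parse_report only when @report is nil, so a single TNS instance downloads and converts the report once and then answers any number of lookups from memory. A sketch, assuming xlsx2csv and unzip are on the PATH (hostnames are illustrative):

    tns = WebAnalyticsDiscovery::TNS.new   # raises immediately if xlsx2csv or unzip is missing
    tns.run('http://example.ru/')          # first lookup triggers the download/unzip/convert chain
    tns.run_id('example.com')              # later lookups reuse the memoized @report hash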
@@ -0,0 +1,54 @@
+ # -*- coding: UTF-8 -*-
+
+ require 'json'
+ require 'web_analytics_discovery/grabberutils'
+
+ module WebAnalyticsDiscovery
+   class YandexMetrika
+     include GrabberUtils
+
+     def run(url)
+       @page = download(url)
+       run_id(find_id)
+     end
+
+     def find_id
+       case @page
+       when /yaCounter(\d+) = new Ya\.Metrika\(\{id:(\d+)/
+         $1
+       else
+         nil
+       end
+     end
+
+     def run_id(id)
+       return nil unless id
+       r = {:id => id}
+
+       json = download("http://bs.yandex.ru/informer/#{id}/json")
+
+       # Unfortunately, the informer returns rather weird JSON (unquoted keys), so it's easier to parse with regexps:
+       # {pageviews:[42917,576537,764371,843611,826967,1009246,990612],visits:[21298,278959,309217,335495,324285,420460,430497],uniques:[20511,240509,254201,275157,270031,356657,366913],
+
+       r[:pv_day] = do_list($1) if json =~ /pageviews:\[([0-9,]+)\]/
+       r[:visits_day] = do_list($1) if json =~ /visits:\[([0-9,]+)\]/
+       r[:visitors_day] = do_list($1) if json =~ /uniques:\[([0-9,]+)\]/
+
+       # Calculate approximations
+       r[:pv_week] = r[:pv_day] * 7 if r[:pv_day]
+       r[:pv_mon] = (r[:pv_day] * AVG_DAYS_IN_MONTH).to_i if r[:pv_day]
+
+       return r
+     end
+
+     def do_list(list)
+       els = list.split(/,/).map { |x| x.to_i }
+
+       # Throw out the first element: it's the current day, which is still incomplete
+       els.shift
+
+       sum = els.inject { |a, b| a + b }
+       return sum / els.size
+     end
+   end
+ end
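do_list drops the first (incomplete) day and averages the rest with integer division. Applied to the pageviews series quoted in the comment above:

    list = '42917,576537,764371,843611,826967,1009246,990612'
    els = list.split(/,/).map { |x| x.to_i }
    els.shift                                  # discard 42917, the incomplete current day
    els.inject { |a, b| a + b } / els.size     # => 835224 average daily pageviews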
@@ -0,0 +1,55 @@
+ require 'fileutils'
+ require 'digest/md5'
+
+ module GrabberUtils
+   CACHE_DIR = 'cache'
+   USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20100101 Firefox/22.0'
+
+   class DownloadError < Exception; end
+
+   def download(url, encoding = 'UTF-8', options = {})
+     fn = download_file(url, options)
+
+     # Truly horrible hack to work around Ruby 1.9.2+ strict handling of invalid UTF-8 characters
+     s = File.read(fn)
+     s.encode!('UTF-16', encoding, :invalid => :replace, :replace => '?')
+     s.encode!('UTF-8', 'UTF-16', :invalid => :replace, :replace => '?')
+   end
+
+   # Downloads a file, returns its filename in the cache directory
+   def download_file(url, options = {})
+     FileUtils.mkdir_p(CACHE_DIR)
+     localfile = options['localfile'] || mangle_url(url)
+     fn = CACHE_DIR + '/' + localfile
+     unless FileTest.exist?(fn)
+       opt = {
+         'user-agent' => USER_AGENT,
+         'load-cookies' => 'cookies.txt',
+         'save-cookies' => 'cookies.txt',
+       }
+       if options['Referer']
+         opt['referer'] = options['Referer']
+       end
+       opt = opt.map { |k, v| "--#{k}='#{v}'" }.join(' ')
+       system("wget --append-output=wget.log --keep-session-cookies -O'#{fn}' #{opt} '#{url}'")
+       if $?.exitstatus != 0
+         File.delete(fn)
+         raise DownloadError.new
+       end
+     end
+
+     return fn
+   end
+
+   # Use the URL itself as a cache filename if it's short enough; hash it otherwise
+   def mangle_url(url)
+     if url.length < 200
+       url.gsub(/[:\/]/, '_')
+     else
+       Digest::MD5.hexdigest(url)
+     end
+   end
+
+   # Average number of days per month
+   AVG_DAYS_IN_MONTH = 365.25 / 12
+ end
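download_file keys its cache purely on the mangled URL and never expires entries, so repeated runs are cheap but stale data persists until the cache directory is cleared. The filename mapping follows directly from mangle_url:

    include GrabberUtils   # assumes the module is loaded; mixes the helpers into the top-level object

    mangle_url('http://rating.openstat.ru/site/601119')
    # => "http___rating.openstat.ru_site_601119" (':' and '/' become '_')

    mangle_url('http://example.com/' + 'x' * 300)
    # => a 32-character MD5 hex digest, since the URL is 200+ characters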