web_analytics_discovery 2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,44 @@
+ require 'web_analytics_discovery/grabberutils'
+
+ module WebAnalyticsDiscovery
+   class Openstat
+     include GrabberUtils
+
+     BIG_SITE_ID = 601119
+
+     def run(url)
+       @page = download(url)
+       run_id(find_id)
+     end
+
+     def find_id
+       case @page
+       when /<span id="(?:openstat|spylog)(\d+)"><\/span>/
+         $1
+       when /<img src=["']?http:\/\/u([0-9.]+)\.spylog\.com\/cnt/
+         $1.gsub(/[.]/, '').to_i
+       else
+         nil
+       end
+     end
+
+     def run_id(id)
+       return nil unless id
+       r = {:id => id}
+       doc = download("http://rating.openstat.ru/site/#{id}")
+       r[:visitors_day], r[:visits_day], r[:pv_day] = grab(doc, 'osb-rating_site-e-table-col-m-day')
+       r[:visitors_mon], r[:visits_mon], r[:pv_mon] = grab(doc, 'osb-rating_site-e-table-col-m-month')
+       return r
+     end
+
+     def grab(doc, classname)
+       a = []
+       doc.gsub(/#{classname}">([^<]+)<\/td>/) {
+         r = $1
+         r.gsub!(/&nbsp;/, '')
+         a << r.to_i
+       }
+       return a
+     end
+   end
+ end
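All grabbers in this release share the same two-step interface: run(url) downloads the page and extracts a counter ID, while run_id(id) fetches the stats and returns a hash of metrics (or nil when no counter is found). A minimal usage sketch, assuming the gem's files are already loaded (the diff omits file paths, so no require line is shown):

    stats = WebAnalyticsDiscovery::Openstat.new.run('http://example.ru/')   # hypothetical URL
    if stats
      puts "Counter #{stats[:id]}: ~#{stats[:visitors_day]} visitors/day, #{stats[:pv_mon]} pageviews/month"
    end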
@@ -0,0 +1,84 @@
+ # -*- coding: UTF-8 -*-
+
+ require 'cgi'
+ require 'uri'
+ require 'json'
+ require 'web_analytics_discovery/grabberutils'
+
+ module WebAnalyticsDiscovery
+   class Quantcast
+     include GrabberUtils
+
+     def run(url)
+       uri = URI.parse(url)
+       run_id(uri.host)
+     end
+
+     def run_id(host)
+       r = {}
+
+       # Get auth cookies
+       doc = download("https://www.quantcast.com/#{host}")
+       if doc =~ /<td class="reach" id="reach-(.*?)">/
+         r[:id] = id = $1
+       else
+         return nil
+       end
+
+       # Quantcast has no traffic info for this site, so stop here
+       return r if doc =~ /content="We do not have enough information to provide a traffic estimate./
+
+       return run_id_quantified(r, host) if doc =~ /<h4>Quantified<\/h4>/
+
+       # Use auth cookies with API call
+       d = traffic_api_call(host, id, 'US', 'DAY30')
+
+       # points = d['reach']['US']['PEOPLE']['WEB']
+       # points.each { |pt|
+       #   puts Time.at(pt['timestamp']).to_s + "\t" + pt['reach'].to_s
+       # }
+
+       r[:visitors_mon] = d['summaries']['US']['PEOPLE']['WEB']['reach'].to_i
+
+       return r
+     end
+
+     # Parse more precise, direct statistics for a quantified site
+     def run_id_quantified(r, host)
+       d = traffic_api_call(host, r[:id], 'GLOBAL', 'DAY1')
+       r[:visitors_day] = avg_last_metric(d, 'UNIQUES', 7)
+       r[:pv_day] = avg_last_metric(d, 'PAGE_VIEWS', 7)
+       r[:visits_day] = avg_last_metric(d, 'VISITS', 7)
+
+       d = traffic_api_call(host, r[:id], 'GLOBAL', 'DAY7')
+       r[:visitors_week] = avg_last_metric(d, 'UNIQUES', 1)
+       r[:pv_week] = avg_last_metric(d, 'PAGE_VIEWS', 1)
+       r[:visits_week] = avg_last_metric(d, 'VISITS', 1)
+
+       d = traffic_api_call(host, r[:id], 'GLOBAL', 'DAY30')['summaries']['GLOBAL']
+       r[:visitors_mon] = d['UNIQUES']['WEB']['reach'].to_i
+       r[:pv_mon] = d['PAGE_VIEWS']['WEB']['reach'].to_i
+       r[:visits_mon] = d['VISITS']['WEB']['reach'].to_i
+
+       return r
+     end
+
+     def traffic_api_call(host, id, country, period)
+       id_encoded = CGI::escape(id)
+       doc = download(
+         "https://www.quantcast.com/api/profile/traffic/?&wUnit=#{id_encoded}&country=#{country}&period=#{period}&countType=",
+         'UTF-8',
+         'Referer' => "https://www.quantcast.com/#{host}?country=#{country}",
+         'X-Requested-With' => 'XMLHttpRequest'
+       )
+       return JSON.load(doc).first
+     end
+
+     def avg_last_metric(d, metric, last_n)
+       sum = 0
+       d['reach']['GLOBAL'][metric]['ONLINE_WEB'][-last_n..-1].each { |pt| sum += pt['reach'] }
+       d['reach']['GLOBAL'][metric]['MOBILE_WEB'][-last_n..-1].each { |pt| sum += pt['reach'] }
+       return (sum / last_n).to_i
+     end
+   end
+ end
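avg_last_metric averages the last N daily data points, summing the ONLINE_WEB and MOBILE_WEB series together, so the result is a combined web-plus-mobile daily figure. A toy run on a fabricated response fragment shaped like the structure indexed above:

    # Fabricated data, not a real Quantcast response
    d = {'reach' => {'GLOBAL' => {'UNIQUES' => {
      'ONLINE_WEB' => [{'reach' => 100}, {'reach' => 120}, {'reach' => 110}],
      'MOBILE_WEB' => [{'reach' => 40}, {'reach' => 50}, {'reach' => 60}]
    }}}}

    last_n = 3
    sum = 0
    d['reach']['GLOBAL']['UNIQUES']['ONLINE_WEB'][-last_n..-1].each { |pt| sum += pt['reach'] }
    d['reach']['GLOBAL']['UNIQUES']['MOBILE_WEB'][-last_n..-1].each { |pt| sum += pt['reach'] }
    sum / last_n   # => 160, i.e. (100+120+110+40+50+60) / 3 combined daily uniques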
@@ -0,0 +1,100 @@
+ # -*- coding: utf-8 -*-
+
+ require 'web_analytics_discovery/grabberutils'
+
+ module WebAnalyticsDiscovery
+   class Rambler
+     include GrabberUtils
+
+     SEC_PER_DAY = 24 * 60 * 60
+
+     def run(url)
+       @page = download(url)
+       run_id(find_id)
+     end
+
+     def find_id
+       case @page
+       when /_top100q.push\(\["setAccount", "(\d+)"\]\)/,
+            /<a href="http:\/\/top100\.rambler\.ru\/cgi-bin\/stats_top100\.cgi\?(\d+)"/,
+            /<script.*src="http:\/\/counter\.rambler\.ru\/top100\.jcn\?(\d+)/,
+            /<img src="http:\/\/counter\.rambler\.ru\/top100\.cnt\?(\d+)"/
+         $1.to_i
+       else
+         nil
+       end
+     end
+
+     def run_id(id)
+       return nil unless id
+       r = {:id => id}
+
+       # doc = download("http://top100.rambler.ru/resStats/#{id}/")
+       doc = download("http://top100.rambler.ru/resStats/#{id}/?_export=csv&_id=#{id}&_page=0", 'UTF-16LE')
+
+       # CSV columns: Today, Yesterday, Last 7 days, Previous week, Last 30 days, Previous month
+
+       if doc =~ /уникальных\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)/
+         r[:visitors_day] = $2.to_i
+         r[:visitors_week] = $4.to_i
+         r[:visitors_mon] = $6.to_i
+       end
+
+       if doc =~ /Визитов \(сессий\)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)/
+         r[:visits_day] = $2.to_i
+         r[:visits_week] = $4.to_i
+         r[:visits_mon] = $6.to_i
+       end
+
+       if doc =~ /Просмотров страниц\n\s*всего\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)/
+         r[:pv_day] = $2.to_i
+         r[:pv_week] = $4.to_i
+         r[:pv_mon] = $6.to_i
+       end
+
+       # Plan B: if the proper CSV export failed, try to look up the information in the rating catalogue
+       unless r[:visitors_day]
+         now = Time.now
+         r[:visitors_day], r[:pv_day] = parse_rating_table("http://top100.rambler.ru/?range=#{spec_yesterday(now)}&stat=1&statcol=1%2C2&query=#{id}", id)
+         r[:visitors_week], r[:pv_week] = parse_rating_table("http://top100.rambler.ru/?range=#{spec_last_week(now)}&stat=1&statcol=1%2C2&query=#{id}", id)
+         r[:visitors_mon], r[:pv_mon] = parse_rating_table("http://top100.rambler.ru/?range=#{spec_last_month(now)}&stat=1&statcol=1%2C2&query=#{id}", id)
+       end
+
+       return r
+     end
+
+     def parse_rating_table(url, id)
+       doc = download(url)
+       if doc =~ /<tr>(\s*<td align="right">.*?<a href="\/resStats\/#{id}\/.*?)<\/tr>/m
+         table_row = $1
+         if table_row =~ /<td align="right">([0-9&nbsp;]+)<\/td>\s*<td class="last" align="right">([0-9&nbsp;]+)<\/td>/m
+           v = $1
+           pv = $2
+           v = v.gsub(/&nbsp;/, '').to_i
+           pv = pv.gsub(/&nbsp;/, '').to_i
+           return [v, pv]
+         end
+       end
+       return [nil, nil]
+     end
+
+     def spec_yesterday(now)
+       (now - SEC_PER_DAY).strftime('%d.%m.%Y')
+     end
+
+     def spec_last_week(now)
+       wday = now.wday
+       wday = 7 if wday == 0
+       end_week = now - (wday * SEC_PER_DAY)
+       start_week = end_week - 6 * SEC_PER_DAY
+       "#{start_week.strftime('%d.%m.%Y')}+-+#{end_week.strftime('%d.%m.%Y')}"
+     end
+
+     def spec_last_month(now)
+       this_month_1 = Time.new(now.year, now.month, 1)
+       last_month_end = this_month_1 - SEC_PER_DAY
+       last_month_start = Time.new(last_month_end.year, last_month_end.month, 1)
+       "#{last_month_start.strftime('%d.%m.%Y')}+-+#{last_month_end.strftime('%d.%m.%Y')}"
+     end
+   end
+ end
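The spec_* helpers build the date-range parameters for the rating-catalogue URLs ('+-+' is apparently the URL-encoded ' - ' separator the site expects). Their behavior for a concrete date, worked out by hand:

    r = WebAnalyticsDiscovery::Rambler.new
    now = Time.new(2014, 10, 15)   # a Wednesday
    r.spec_yesterday(now)          # => "14.10.2014"
    r.spec_last_week(now)          # => "06.10.2014+-+12.10.2014", the previous Monday..Sunday
    r.spec_last_month(now)         # => "01.09.2014+-+30.09.2014"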
@@ -0,0 +1,118 @@
+ # -*- coding: utf-8 -*-
+
+ require 'web_analytics_discovery/grabberutils'
+
+ require 'date'
+ require 'uri'
+
+ module WebAnalyticsDiscovery
+   class TNS
+     include GrabberUtils
+
+     def initialize
+       # This one requires the xlsx2csv utility
+       begin
+         `xlsx2csv --version`
+       rescue Errno::ENOENT
+         raise 'xlsx2csv not available: unable to run TNS report discovery'
+       end
+
+       # And an unzip utility
+       begin
+         `unzip -v`
+       rescue Errno::ENOENT
+         raise 'unzip not available: unable to run TNS report discovery'
+       end
+     end
+
+     # Parsing a TNS report involves the following stages:
+     #
+     # 1. Download a non-empty "directory" page from their web site
+     #    for the current year (keep requesting older years as long as
+     #    the output stays empty; bail out on HTTP error)
+     #
+     # 2. Download the first (most recent) report listed on that "directory" page
+     #
+     # 3. Unpack (unzip) the downloaded report file; it's a zip that
+     #    contains multiple files, including a single .xlsx file with
+     #    raw data.
+     #
+     # 4. Convert the .xlsx file into something more readable (CSV)
+     #    with an external utility.
+     #
+     # 5. Parse the resulting CSV report into memory (it's relatively
+     #    short - as of 2014-10, TNS lists only ~500 sites)
+     def parse_report
+       report_url = query_directory
+       zipped = download_file(report_url)
+       unzipped = ensure_unpack(zipped)
+       converted = ensure_convert(unzipped)
+
+       @report = {}
+       File.foreach(converted) { |l|
+         c = l.chomp.split(/\t/)
+
+         # Skip headers
+         next if c.size < 5
+
+         # Skip table column headers
+         next if c[0].empty?
+
+         # Skip generic audience info columns
+         next if c[1].empty?
+
+         # Downcase the URL and calculate proper monthly visitors
+         visitors = (c[2].to_f * 1000).to_i
+         url = c[1].downcase.gsub(/ \(сайт\)$/, '')
+
+         @report[url] = visitors
+       }
+     end
+
+     MAX_TRIES = 5
+
+     def query_directory
+       y = Date.today.year
+       MAX_TRIES.times {
+         dir = download("http://www.tns-global.ru/services/media/media-audience/internet/information/?arrFilter_pf%5BYEAR%5D=#{y}&set_filter=%D0%9F%D0%BE%D0%BA%D0%B0%D0%B7%D0%B0%D1%82%D1%8C&set_filter=Y")
+         if dir =~ /<a href="(\/services\/media\/media-audience\/internet\/information\/\?download=\d+&date=.*?)">/
+           return "http://www.tns-global.ru#{$1}"
+         end
+         y -= 1
+       }
+       raise 'Unable to query report directory - not a single report found'
+     end
+
+     def ensure_unpack(zipped)
+       unzipped = "#{CACHE_DIR}/tns_#{File.basename(zipped)}.xlsx"
+       unless File.exist?(unzipped)
+         system("unzip -pq '#{zipped}' *.xlsx >'#{unzipped}'")
+         raise 'Unable to unpack TNS report' unless $?.exitstatus == 0
+       end
+       return unzipped
+     end
+
+     def ensure_convert(unzipped)
+       converted = "#{CACHE_DIR}/#{File.basename(unzipped)}.tsv"
+       unless File.exist?(converted)
+         system("xlsx2csv -d tab -s 1 '#{unzipped}' >'#{converted}'")
+         raise 'Unable to convert TNS report to .tsv' unless $?.exitstatus == 0
+       end
+       return converted
+     end
+
+     def run(url)
+       run_id(find_id(url))
+     end
+
+     def find_id(url)
+       URI.parse(url).host
+     end
+
+     def run_id(id)
+       parse_report unless @report
+       v = @report[id]
+       return v ? {:id => id, :visitors_mon => v} : nil
+     end
+   end
+ end
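Note that run_id calls parse_report only when @report is nil, so a single TNS instance downloads and converts the report once and then answers any number of lookups from memory. A sketch, assuming xlsx2csv and unzip are on the PATH (hostnames are illustrative):

    tns = WebAnalyticsDiscovery::TNS.new   # raises immediately if xlsx2csv or unzip is missing
    tns.run('http://example.ru/')          # first lookup triggers the download/unzip/convert chain
    tns.run_id('example.com')              # later lookups reuse the memoized @report hash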
@@ -0,0 +1,54 @@
+ # -*- coding: UTF-8 -*-
+
+ require 'json'
+ require 'web_analytics_discovery/grabberutils'
+
+ module WebAnalyticsDiscovery
+   class YandexMetrika
+     include GrabberUtils
+
+     def run(url)
+       @page = download(url)
+       run_id(find_id)
+     end
+
+     def find_id
+       case @page
+       when /yaCounter(\d+) = new Ya\.Metrika\(\{id:(\d+)/
+         $1
+       else
+         nil
+       end
+     end
+
+     def run_id(id)
+       return nil unless id
+       r = {:id => id}
+
+       json = download("http://bs.yandex.ru/informer/#{id}/json")
+
+       # Unfortunately, the informer returns rather weird JSON (unquoted keys), so it's easier to parse with regexps:
+       # {pageviews:[42917,576537,764371,843611,826967,1009246,990612],visits:[21298,278959,309217,335495,324285,420460,430497],uniques:[20511,240509,254201,275157,270031,356657,366913],
+
+       r[:pv_day] = do_list($1) if json =~ /pageviews:\[([0-9,]+)\]/
+       r[:visits_day] = do_list($1) if json =~ /visits:\[([0-9,]+)\]/
+       r[:visitors_day] = do_list($1) if json =~ /uniques:\[([0-9,]+)\]/
+
+       # Calculate approximations
+       r[:pv_week] = r[:pv_day] * 7 if r[:pv_day]
+       r[:pv_mon] = (r[:pv_day] * AVG_DAYS_IN_MONTH).to_i if r[:pv_day]
+
+       return r
+     end
+
+     def do_list(list)
+       els = list.split(/,/).map { |x| x.to_i }
+
+       # Throw out the first element: it's the current day, which is still incomplete
+       els.shift
+
+       sum = els.inject { |a, b| a + b }
+       return sum / els.size
+     end
+   end
+ end
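do_list drops the first (incomplete) day and averages the rest with integer division. Applied to the pageviews series quoted in the comment above:

    list = '42917,576537,764371,843611,826967,1009246,990612'
    els = list.split(/,/).map { |x| x.to_i }
    els.shift                                  # discard 42917, the incomplete current day
    els.inject { |a, b| a + b } / els.size     # => 835224 average daily pageviews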
@@ -0,0 +1,55 @@
+ require 'fileutils'
+ require 'digest/md5'
+
+ module GrabberUtils
+   CACHE_DIR = 'cache'
+   USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20100101 Firefox/22.0'
+
+   class DownloadError < Exception; end
+
+   def download(url, encoding = 'UTF-8', options = {})
+     fn = download_file(url, options)
+
+     # Truly horrible hack to work around Ruby 1.9.2+ strict handling of invalid UTF-8 characters
+     s = File.read(fn)
+     s.encode!('UTF-16', encoding, :invalid => :replace, :replace => '?')
+     s.encode!('UTF-8', 'UTF-16', :invalid => :replace, :replace => '?')
+   end
+
+   # Downloads a file, returns its filename in the cache directory
+   def download_file(url, options = {})
+     FileUtils.mkdir_p(CACHE_DIR)
+     localfile = options['localfile'] || mangle_url(url)
+     fn = CACHE_DIR + '/' + localfile
+     unless FileTest.exist?(fn)
+       opt = {
+         'user-agent' => USER_AGENT,
+         'load-cookies' => 'cookies.txt',
+         'save-cookies' => 'cookies.txt',
+       }
+       if options['Referer']
+         opt['referer'] = options['Referer']
+       end
+       opt = opt.map { |k, v| "--#{k}='#{v}'" }.join(' ')
+       system("wget --append-output=wget.log --keep-session-cookies -O'#{fn}' #{opt} '#{url}'")
+       if $?.exitstatus != 0
+         File.delete(fn)
+         raise DownloadError.new
+       end
+     end
+
+     return fn
+   end
+
+   # Use the URL itself as a cache filename if it's short enough; hash it otherwise
+   def mangle_url(url)
+     if url.length < 200
+       url.gsub(/[:\/]/, '_')
+     else
+       Digest::MD5.hexdigest(url)
+     end
+   end
+
+   # Average number of days per month
+   AVG_DAYS_IN_MONTH = 365.25 / 12
+ end
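download_file keys its cache purely on the mangled URL and never expires entries, so repeated runs are cheap but stale data persists until the cache directory is cleared. The filename mapping follows directly from mangle_url:

    include GrabberUtils   # assumes the module is loaded; mixes the helpers into the top-level object

    mangle_url('http://rating.openstat.ru/site/601119')
    # => "http___rating.openstat.ru_site_601119" (':' and '/' become '_')

    mangle_url('http://example.com/' + 'x' * 300)
    # => a 32-character MD5 hex digest, since the URL is 200+ characters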