pandata 0.1.0.pre
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/pandata +86 -0
- data/lib/pandata.rb +18 -0
- data/lib/pandata/argv_parser.rb +131 -0
- data/lib/pandata/data_formatter.rb +105 -0
- data/lib/pandata/data_urls.rb +20 -0
- data/lib/pandata/downloader.rb +56 -0
- data/lib/pandata/parser.rb +191 -0
- data/lib/pandata/scraper.rb +165 -0
- metadata +81 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 7f73c416a585d7b27a59799982502ab3425c0830
|
4
|
+
data.tar.gz: 88921adfb03392ac5af5f56402e1ef58e6cee767
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 17a42cf2bf107425067d52db7139abe3bb96321cdd9cc16e9041ebe8b4641264e21dbbe8040f9a2fd5cf3df8af3c161f1d51aa60ccbefc5980ab7616b12806d3
|
7
|
+
data.tar.gz: 682286a820a8f4314fe399f201e6d5362929d4b823e3b0b2c44a2cb2c09f84f1441cc7d1c10271ddedb51978103c664d492292e74e5d209c1716b0e064d67844
|
data/bin/pandata
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command-line entry point for Pandata. Downloads a Pandora user's public
# data (likes, bookmarks, stations, followers, etc.) and either prints it
# to stdout or appends it to a file given via -o/--output_file.

require_relative '../lib/pandata'
require_relative '../lib/pandata/argv_parser'
require_relative '../lib/pandata/data_formatter'

options = Pandata::ArgvParser.parse(ARGV)

output_file = options[:output_file]
if output_file
  # Start with a fresh file; the #write defined below appends.
  # BUGFIX: File.exists? was deprecated and removed in Ruby 3.2 -- use File.exist?.
  File.delete(output_file) if File.exist?(output_file)

  # Define a global #write that appends to the output file.
  Object.send(:define_method, :write) do |string|
    File.open(output_file, 'a') do |file|
      file.write(string)
    end
  end
else
  # No output file: #write prints to stdout instead.
  def write(string)
    puts string
  end
end

if ARGV.empty?
  # Print command-line usage help.
  puts options[:opts]
  exit
end

scraper = Pandata::Scraper.get(options[:user_id])
formatter = Pandata::DataFormatter.new

# If scraper is an array, a Pandora user could not be found with certainty.
# In this case, scraper will contain webnames similar to options[:user_id].
if scraper.kind_of?(Array)
  puts "No exact match for '#{options[:user_id]}'."

  unless scraper.empty?
    puts "\nWebname results for '#{options[:user_id]}':"
    puts formatter.list(scraper)
  end

  exit
end

scraper_data = {}
options[:data_to_get].each do |data_type|
  # e.g. :liked_tracks -> scraper.likes(:tracks),
  #      :bookmarked_artists -> scraper.bookmarks(:artists)
  if /(bookmark|like)e?d_(.*)/ =~ data_type
    method = $1 << 's'    # 'likes' or 'bookmarks'
    argument = $2.to_sym  # :tracks, :artists, :stations or :albums
    scraper_data[data_type] = scraper.public_send(method, argument)
  else
    scraper_data[data_type] = scraper.public_send(data_type)
  end
end

if options[:return_as_json]
  require 'json'
  write JSON.generate(scraper_data)
  exit
end

scraper_data.each do |key, value|
  # Capitalize each word in the key symbol.
  # e.g. :liked_tracks becomes 'Liked Tracks:'
  title = key.to_s.split('_').map(&:capitalize).join(' ') << ':'

  if value.empty?
    output = ' ** No Data **'
  else
    # Pick the formatter appropriate for this data type.
    # (Regexp#=== matches against the symbol's name here.)
    output = case key
             when /playing_station|recent_activity/
               formatter.list(value)
             when /liked_tracks|bookmarked_tracks/
               formatter.tracks(value)
             when /liked_artists|bookmarked_artists|stations|liked_stations/
               formatter.sort_list(value)
             when :liked_albums
               formatter.albums(value)
             when /following|followers/
               formatter.followx(value)
             end
  end

  write "#{ title }\n#{ output }"
end
|
data/lib/pandata.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'pandata/data_urls'
|
4
|
+
require_relative 'pandata/downloader'
|
5
|
+
require_relative 'pandata/parser'
|
6
|
+
require_relative 'pandata/scraper'
|
7
|
+
|
8
|
+
module Pandata
  # Version information for the Pandata gem.
  module Version
    MAJOR = 0
    MINOR = 1
    PATCH = 0
    BUILD = 'pre'

    # Full version string, e.g. "0.1.0.pre".
    # `compact` lets BUILD be set to nil for final (non-prerelease) releases.
    STRING = [MAJOR, MINOR, PATCH, BUILD].compact.join('.')
  end
end
|
18
|
+
|
@@ -0,0 +1,131 @@
|
|
1
|
+
require 'optparse'
|
2
|
+
require_relative '../pandata'
|
3
|
+
|
4
|
+
module Pandata

  # Parses command-line input.
  class ArgvParser
    # Prevent instances -- this class is used only via ::parse.
    private_class_method :new

    # Takes an ARGV (array) argument.
    #
    # Returns a hash with:
    # - :opts (OptionParser object)
    # - :user_id (string)
    # - :output_file (string)
    # - :data_to_get (array)
    # - :return_as_json (boolean)
    def self.parse(argv)
      options = { data_to_get: [] }
      get_all_data = false

      options[:opts] = OptionParser.new do |opts|
        opts.banner = 'Pandata: A tool for downloading Pandora.com data (likes, bookmarks, stations, etc.)'
        opts.define_head 'Usage: pandata <email|webname> [options]'
        opts.separator <<-END

  Examples:
    pandata john@example.com --liked_tracks
    pandata my_webname --all -o my_pandora_data.txt
    pandata my_webname -lLb --json

  Options:
        END

        opts.on('--all', 'Get all data') do
          get_all_data = true
        end

        opts.on('-a', '--recent_activity', 'Get recent activity') do
          options[:data_to_get] << :recent_activity
        end

        opts.on('-B', '--bookmarked_artists', 'Get all bookmarked artists') do
          options[:data_to_get] << :bookmarked_artists
        end

        opts.on('-b', '--bookmarked_tracks', 'Get all bookmarked tracks') do
          options[:data_to_get] << :bookmarked_tracks
        end

        opts.on('-F', '--followers', "Get all user's followers") do
          options[:data_to_get] << :followers
        end

        opts.on('-f', '--following', 'Get all users being followed by user') do
          options[:data_to_get] << :following
        end

        opts.on('-j', '--json', 'Return the results as JSON') do
          options[:return_as_json] = true
        end

        opts.on('-L', '--liked_artists', 'Get all liked artists') do
          options[:data_to_get] << :liked_artists
        end

        opts.on('-l', '--liked_tracks', 'Get all liked tracks') do
          options[:data_to_get] << :liked_tracks
        end

        opts.on('-m', '--liked_albums', 'Get all liked albums') do
          options[:data_to_get] << :liked_albums
        end

        opts.on('-n', '--liked_stations', 'Get all liked stations') do
          options[:data_to_get] << :liked_stations
        end

        opts.on('-o', '--output_file PATH', 'File to output the data into') do |path|
          options[:output_file] = path
        end

        opts.on('-S', '--playing_station', 'Get currently playing station') do
          options[:data_to_get] << :playing_station
        end

        opts.on('-s', '--stations', 'Get all stations') do
          options[:data_to_get] << :stations
        end

        opts.on_tail("-h", "--help", "Show this message") do
          puts opts
          exit
        end

        opts.on_tail("--version", "Show version") do
          puts Pandata::Version::STRING
          exit
        end
      end

      # OptionParser#parse is non-destructive; it returns the non-option
      # arguments. BUGFIX: previously `argv.shift` was used directly, so a
      # leading flag (e.g. `pandata -l webname`) became the user ID.
      remaining = options[:opts].parse(argv)

      # User ID is the first non-option argument.
      options[:user_id] = remaining.first

      # Preserve the historical contract: consume one element from the
      # caller's argv so bin/pandata can detect "nothing left to do" via
      # ARGV.empty? after parsing.
      argv.shift

      if get_all_data
        options[:data_to_get] = [
          :recent_activity,
          :playing_station,
          :stations,
          :bookmarked_tracks,
          :bookmarked_artists,
          :liked_tracks,
          :liked_artists,
          :liked_albums,
          :liked_stations,
          :followers,
          :following
        ]
      else
        # Remove any duplicates caused by supplying flags multiple times.
        options[:data_to_get].uniq!
      end

      options
    end
  end
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module Pandata

  # Sorts and formats Pandata::Scraper data as a string for printing.
  class DataFormatter

    # Takes an array or a single item and returns a string with each item
    # on its own line:
    #   - item1
    #   - item2
    def list(data)
      items = data.kind_of?(Array) ? data : [data]
      items.map { |entry| " - #{entry}\n" }.join
    end

    # Identical to #list but sorts alphabetically ignoring a leading 'the'.
    def sort_list(data)
      list(custom_sort(data))
    end

    # Takes an array of hashes with :artist and :track keys.
    def tracks(tracks)
      artists_items(tracks, :track)
    end

    # Takes an array of hashes with :artist and :album keys.
    def albums(albums)
      artists_items(albums, :album)
    end

    # Takes an array of hashes with :name, :webname and :href keys.
    # Output is sorted case-insensitively by webname.
    def followx(data)
      data.sort_by { |user| user[:webname].downcase }.map do |user|
        " - name: #{user[:name]}\n" \
        "   webname: #{user[:webname]}\n" \
        "   href: #{user[:href]}\n"
      end.join
    end

    private

    # Takes an array or hash.
    # Sorts alphabetically, ignoring an initial 'The' and letter case
    # (so lowercase names are not sorted last).
    def custom_sort(enumerable)
      ordered = enumerable.sort_by { |key, _| key.sub(/^the\s*/i, '').downcase }

      # sort_by returns an array of pairs when called on a hash, so the
      # hash must be rebuilt.
      enumerable.kind_of?(Hash) ? Hash[ordered] : ordered
    end

    # Takes an array of hashes with :artist plus one other key (:track or
    # :album). Returns a sorted string with each artist name on its own line
    # and that artist's items indented below it:
    #   - Artist1:
    #      - item2
    #      - item3
    def artists_items(data, item_name)
      grouped = {}
      data.each do |entry|
        # A Set removes duplicate items per artist.
        (grouped[entry[:artist]] ||= Set.new) << entry[item_name]
      end

      custom_sort(grouped).map do |artist, items|
        " - #{artist}\n" + custom_sort(items).map { |item| "    - #{item}\n" }.join
      end.join
    end

  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Pandata
  # Number of results to request from a feeds.pandora.com URL.
  # Deliberately huge so a single request returns everything.
  MAX_RESULTS = 100000

  # Format-string templates for Pandora's data endpoints. Interpolate with
  # String#% using the named references (e.g. %{webname}).
  # Frozen so the shared constant cannot be mutated accidentally.
  DATA_FEED_URLS = {
    user_search: 'http://www.pandora.com/content/connect?searchString=%{searchString}',
    recent_activity: 'http://feeds.pandora.com/feeds/people/%{webname}/recentactivity.xml',
    playing_station: 'http://feeds.pandora.com/feeds/people/%{webname}/nowplaying.xml',
    stations: "http://feeds.pandora.com/feeds/people/%{webname}/stations.xml?max=#{MAX_RESULTS}",
    bookmarked_tracks: "http://feeds.pandora.com/feeds/people/%{webname}/favorites.xml?max=#{MAX_RESULTS}",
    bookmarked_artists: "http://feeds.pandora.com/feeds/people/%{webname}/favoriteartists.xml?max=#{MAX_RESULTS}",
    liked_tracks: 'http://www.pandora.com/content/tracklikes?likeStartIndex=%{nextLikeStartIndex}&thumbStartIndex=%{nextThumbStartIndex}&webname=%{webname}',
    liked_artists: 'http://www.pandora.com/content/artistlikes?artistStartIndex=%{nextStartIndex}&webname=%{webname}',
    liked_stations: 'http://www.pandora.com/content/stationlikes?stationStartIndex=%{nextStartIndex}&webname=%{webname}',
    liked_albums: 'http://www.pandora.com/content/albumlikes?albumStartIndex=%{nextStartIndex}&webname=%{webname}',
    following: 'http://www.pandora.com/content/following?startIndex=%{nextStartIndex}&webname=%{webname}',
    followers: 'http://www.pandora.com/content/followers?startIndex=%{nextStartIndex}&webname=%{webname}'
  }.freeze
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
module Pandata
  # Custom Pandata error, raised when a download fails or this version of
  # Pandata is too old to work with Pandora.com.
  class PandataError < StandardError
  end

  # Retrieves data from Pandora and handles errors.
  class Downloader
    # A GitHub Gist that contains an updated cookie allowing access to 'login-only' visible data.
    CONFIG_URL = 'https://gist.github.com/ustasb/596f1ee96d03463fde77/raw/pandata_config.json'

    class << self
      # Pandora cookie cached at the class level, shared by all instances.
      attr_accessor :cookie
    end

    # Gets a Pandora cookie and returns a Downloader instance.
    def initialize
      # If we already have a cookie, don't get another.
      unless Downloader.cookie
        Downloader.cookie = get_cookie
      end
    end

    # Downloads a page and returns its content as a string.
    def read_page(url)
      download(url, Downloader.cookie).read
    end

    private

    # Downloads a page and handles errors.
    #
    # BUGFIX: URI.escape and the open-uri patch to Kernel#open were removed
    # in Ruby 3.0 -- use URI::DEFAULT_PARSER.escape and URI#open instead.
    def download(url, cookie = '')
      escaped_url = URI::DEFAULT_PARSER.escape(url)

      begin
        URI.parse(escaped_url).open('Cookie' => cookie, :read_timeout => 5)
      rescue OpenURI::HTTPError => error
        puts "The network request for:\n #{url}\nreturned an error:\n #{error.message}"
        puts "Please try again later or update Pandata. Sorry about that!\n\nFull error:"
        raise PandataError
      end
    end

    # Fetches the remote config and returns the cookie it contains.
    # Raises PandataError when the config says this Pandata version is too
    # old to keep working against Pandora.com.
    def get_cookie
      config = JSON.parse download(CONFIG_URL).read

      if Gem::Version.new(Pandata::Version::STRING) <= Gem::Version.new(config['required_update_for'])
        raise PandataError, 'Pandora.com has changed something and you need to update Pandata!'
      end

      config['cookie']
    end
  end
end
|
@@ -0,0 +1,191 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module Pandata

  # Parses HTML/XML pages from Pandora for relevant data.
  class Parser

    # Returns an array of webnames found in a user-search results page.
    def get_webnames_from_search(html)
      Nokogiri::HTML(html).css('.user_name a').map { |anchor| anchor['webname'] }
    end

    # Returns the query parameters necessary to get the next page of data
    # from Pandora, or false when no 'show more' link exists.
    def get_next_data_indices(html)
      show_more = Nokogiri::HTML(html).css('.show_more')[0]
      return false unless show_more

      indices = {}
      %w[nextStartIndex nextLikeStartIndex nextThumbStartIndex].each do |attr_name|
        attribute = show_more.attributes['data-' + attr_name.downcase]
        indices[attr_name.to_sym] = attribute.value.to_i if attribute
      end

      indices
    end

    # Returns an array of recent activities.
    def get_recent_activity(xml)
      collect_item_titles(xml)
    end

    # Returns an array of station names.
    def get_stations(xml)
      collect_item_titles(xml)
    end

    # Returns the currently playing station name.
    def get_playing_station(xml)
      current = ''

      xml_each_item(xml) do |title|
        current = title # The first item title is the station name.
        break
      end

      current
    end

    # Returns an array of hashes with :artist and :track keys.
    def get_bookmarked_tracks(xml)
      bookmarks = []

      xml_each_item(xml) do |title|
        track, artist = title.split(' by ')
        bookmarks << { artist: artist, track: track }
      end

      bookmarks
    end

    # Returns an array of artist names.
    def get_bookmarked_artists(xml)
      collect_item_titles(xml)
    end

    # Returns an array of hashes with :artist and :track keys.
    def get_liked_tracks(html)
      liked = []
      infobox_each_link(html) { |title, subtitle| liked << { track: title, artist: subtitle } }
      liked
    end

    # Returns an array of artist names.
    def get_liked_artists(html)
      get_infobox_titles(html)
    end

    # Returns an array of station names.
    def get_liked_stations(html)
      get_infobox_titles(html)
    end

    # Returns an array of hashes with :artist and :album keys.
    def get_liked_albums(html)
      liked = []
      infobox_each_link(html) { |title, subtitle| liked << { album: title, artist: subtitle } }
      liked
    end

    # Returns an array of hashes with :name, :webname and :href keys.
    def get_following(html)
      get_followx_users(html)
    end

    # Returns an array of hashes with :name, :webname and :href keys.
    def get_followers(html)
      get_followx_users(html)
    end

    private

    # Collects every 'item' title into an array.
    def collect_item_titles(xml)
      titles = []
      xml_each_item(xml) { |title| titles << title }
      titles
    end

    # Loops over each 'item' tag and yields the title and description.
    def xml_each_item(xml)
      Nokogiri::XML(xml).css('item').each do |item|
        yield(item.at_css('title').text, item.at_css('description').text)
      end
    end

    # Loops over each .infobox container and yields the title and subtitle.
    def infobox_each_link(html)
      Nokogiri::HTML(html).css('.infobox').each do |infobox|
        body = infobox.css('.infobox-body')

        title = body.css('h3 a').text.strip
        subtitle = body.css('p a').first
        subtitle = subtitle.text.strip if subtitle

        yield(title, subtitle)
      end
    end

    # Returns an array of titles from #infobox_each_link.
    def get_infobox_titles(html)
      titles = []
      infobox_each_link(html) { |title| titles << title }
      titles
    end

    # Loops over each .follow_section container and returns an array of
    # hashes with :name, :webname and :href keys.
    def get_followx_users(html)
      Nokogiri::HTML(html).css('.follow_section').map do |section|
        listener_name = section.css('.listener_name').first
        webname = listener_name['webname']

        # Remove any 'spans with a space' that sometimes appear with special characters.
        listener_name.css('span').each(&:remove)
        name = listener_name.text.strip

        { name: name, webname: webname, href: section.css('a').first['href'] }
      end
    end

  end
end
|
@@ -0,0 +1,165 @@
|
|
1
|
+
require_relative 'data_urls'
|
2
|
+
require_relative 'parser'
|
3
|
+
require_relative 'downloader'
|
4
|
+
|
5
|
+
module Pandata

  # Downloads a user's Pandora.com data.
  # A user's profile must be public for Pandata to download its data.
  class Scraper

    # What Pandora uses to identify a user and it remains constant even if
    # the user ties a new email address to their Pandora account.
    attr_reader :webname

    # Takes either an email or a webname string.
    # Returns either:
    # - a new scraper object for the supplied user ID.
    # - an array of similar webnames because a matching Pandora user could not be found.
    def self.get(user_id)
      search_url = DATA_FEED_URLS[:user_search] % { searchString: user_id }
      html = Downloader.new.read_page(search_url)
      webnames = Parser.new.get_webnames_from_search(html)

      if webnames.include?(user_id)
        new(user_id)
      elsif webnames.size == 1 && /.*@.*\..*/ =~ user_id
        # user_id looks like an email address and the search still produced
        # exactly one webname -- assume it belongs to that email.
        new(webnames.first)
      else
        webnames
      end
    end

    private_class_method :new

    def initialize(webname)
      @downloader = Downloader.new
      @parser = Parser.new
      @webname = webname
    end

    # Returns an array of the user's recent activity.
    def recent_activity
      scrape_for(:recent_activity, :get_recent_activity)
    end

    # Returns the user's currently playing station.
    def playing_station
      scrape_for(:playing_station, :get_playing_station).first
    end

    # Returns an array of the user's stations.
    def stations
      scrape_for(:stations, :get_stations)
    end

    # Returns a user's bookmarked data.
    #
    # Bookmark types:
    # - :artists - Returns an array of artist names.
    # - :tracks - Returns an array of hashes with :artist and :track keys.
    # - :all - Returns a hash with all bookmarked data.
    def bookmarks(bookmark_type = :all)
      case bookmark_type
      when :tracks
        scrape_for(:bookmarked_tracks, :get_bookmarked_tracks)
      when :artists
        scrape_for(:bookmarked_artists, :get_bookmarked_artists)
      when :all
        { artists: bookmarks(:artists),
          tracks: bookmarks(:tracks) }
      end
    end

    # Returns a user's liked data. (The results from giving a 'thumbs up.')
    #
    # Like types:
    # - :artists - Returns an array of artist names.
    # - :albums - Returns an array of album names.
    # - :stations - Returns an array of station names.
    # - :tracks - Returns an array of hashes with :artist and :track keys.
    # - :all - Returns a hash with all liked data.
    def likes(like_type = :all)
      case like_type
      when :tracks
        scrape_for(:liked_tracks, :get_liked_tracks)
      when :artists
        scrape_for(:liked_artists, :get_liked_artists)
      when :stations
        scrape_for(:liked_stations, :get_liked_stations)
      when :albums
        scrape_for(:liked_albums, :get_liked_albums)
      when :all
        { artists: likes(:artists),
          albums: likes(:albums),
          stations: likes(:stations),
          tracks: likes(:tracks) }
      end
    end

    # Returns the *public* users being followed by the user.
    #
    # Returns an array of hashes with keys:
    # - :name - Profile name
    # - :webname - Unique Pandora ID
    # - :href - URL to online Pandora profile.
    def following
      scrape_for(:following, :get_following)
    end

    # Returns the user's followers in a format identical to #following.
    def followers
      scrape_for(:followers, :get_followers)
    end

    private

    # Downloads all data for a given type, calls the supplied Pandata::Parser
    # method and removes any duplicates.
    def scrape_for(data_type, parser_method)
      results = []

      download_all_data(get_url(data_type)) do |html, next_data_indices|
        parsed = @parser.public_send(parser_method, html)

        if parsed.kind_of?(Array)
          results.concat(parsed)
        else
          results.push(parsed)
        end

        # The block's value becomes the next URL to fetch (nil ends the loop).
        get_url(data_type, next_data_indices) if next_data_indices
      end

      # Pandora data often contains duplicates--get rid of them.
      results.uniq
    end

    # Downloads all data given a starting URL. Some Pandora feeds only return
    # 5 - 10 items per page but contain a link to the next set of data. Threads
    # cannot be used because page A be must visited to know how to obtain page B.
    def download_all_data(url)
      next_data_indices = {}

      while next_data_indices
        html = @downloader.read_page(url)
        next_data_indices = @parser.get_next_data_indices(html)
        url = yield(html, next_data_indices)
      end
    end

    # Grabs a URL from DATA_FEED_URLS and formats it appropriately.
    def get_url(data_name, next_data_indices = {})
      if next_data_indices.empty?
        next_data_indices = {
          nextStartIndex: 0,
          nextLikeStartIndex: 0,
          nextThumbStartIndex: 0
        }
      end

      next_data_indices[:webname] = @webname
      DATA_FEED_URLS[data_name] % next_data_indices
    end

  end
end
|
metadata
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: pandata
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0.pre
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Brian Ustas
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-03-10 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 1.5.6
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 1.5.6
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 2.12.2
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ~>
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 2.12.2
|
41
|
+
description: A library and tool for downloading Pandora.com data (likes, bookmarks,
|
42
|
+
stations, etc.)
|
43
|
+
email: brianustas@gmail.com
|
44
|
+
executables:
|
45
|
+
- pandata
|
46
|
+
extensions: []
|
47
|
+
extra_rdoc_files: []
|
48
|
+
files:
|
49
|
+
- lib/pandata/argv_parser.rb
|
50
|
+
- lib/pandata/data_formatter.rb
|
51
|
+
- lib/pandata/data_urls.rb
|
52
|
+
- lib/pandata/downloader.rb
|
53
|
+
- lib/pandata/parser.rb
|
54
|
+
- lib/pandata/scraper.rb
|
55
|
+
- lib/pandata.rb
|
56
|
+
- bin/pandata
|
57
|
+
homepage: https://github.com/ustasb/pandata
|
58
|
+
licenses:
|
59
|
+
- MIT
|
60
|
+
metadata: {}
|
61
|
+
post_install_message:
|
62
|
+
rdoc_options: []
|
63
|
+
require_paths:
|
64
|
+
- lib
|
65
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 1.9.1
|
70
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
71
|
+
requirements:
|
72
|
+
- - '>'
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: 1.3.1
|
75
|
+
requirements: []
|
76
|
+
rubyforge_project:
|
77
|
+
rubygems_version: 2.0.2
|
78
|
+
signing_key:
|
79
|
+
specification_version: 4
|
80
|
+
summary: A Pandora.com web scraper
|
81
|
+
test_files: []
|