pandata 0.1.0.pre
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/pandata +86 -0
- data/lib/pandata.rb +18 -0
- data/lib/pandata/argv_parser.rb +131 -0
- data/lib/pandata/data_formatter.rb +105 -0
- data/lib/pandata/data_urls.rb +20 -0
- data/lib/pandata/downloader.rb +56 -0
- data/lib/pandata/parser.rb +191 -0
- data/lib/pandata/scraper.rb +165 -0
- metadata +81 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 7f73c416a585d7b27a59799982502ab3425c0830
+  data.tar.gz: 88921adfb03392ac5af5f56402e1ef58e6cee767
+SHA512:
+  metadata.gz: 17a42cf2bf107425067d52db7139abe3bb96321cdd9cc16e9041ebe8b4641264e21dbbe8040f9a2fd5cf3df8af3c161f1d51aa60ccbefc5980ab7616b12806d3
+  data.tar.gz: 682286a820a8f4314fe399f201e6d5362929d4b823e3b0b2c44a2cb2c09f84f1441cc7d1c10271ddedb51978103c664d492292e74e5d209c1716b0e064d67844
data/bin/pandata
ADDED
@@ -0,0 +1,86 @@
+#!/usr/bin/env ruby
+
+require_relative '../lib/pandata'
+require_relative '../lib/pandata/argv_parser'
+require_relative '../lib/pandata/data_formatter'
+
+options = Pandata::ArgvParser.parse(ARGV)
+
+output_file = options[:output_file]
+if output_file
+  File.delete(output_file) if File.exists?(output_file)
+
+  Object.send(:define_method, :write) do |string|
+    File.open(output_file, 'a') do |file|
+      file.write(string)
+    end
+  end
+else
+  def write(string)
+    puts string
+  end
+end
+
+if ARGV.empty?
+  # Print command-line usage help.
+  puts options[:opts]
+  exit
+end
+
+scraper = Pandata::Scraper.get(options[:user_id])
+formatter = Pandata::DataFormatter.new
+
+# If scraper is an array, a Pandora user could not be found with certainty.
+# In this case, scraper will contain webnames similar to options[:user_id].
+if scraper.kind_of?(Array)
+  puts "No exact match for '#{options[:user_id]}'."
+
+  unless scraper.empty?
+    puts "\nWebname results for '#{options[:user_id]}':"
+    puts formatter.list(scraper)
+  end
+
+  exit
+end
+
+scraper_data = {}
+options[:data_to_get].each do |data_type|
+  if /(bookmark|like)e?d_(.*)/ =~ data_type
+    method = $1 << 's'    # 'likes' or 'bookmarks'
+    argument = $2.to_sym  # :tracks, :artists, :stations or :albums
+    scraper_data[data_type] = scraper.public_send(method, argument)
+  else
+    scraper_data[data_type] = scraper.public_send(data_type)
+  end
+end
+
+if options[:return_as_json]
+  require 'json'
+  write JSON.generate(scraper_data)
+  exit
+end
+
+scraper_data.each do |key, value|
+  # Capitalize each word in the key symbol.
+  # e.g. :liked_tracks becomes 'Liked Tracks:'
+  title = key.to_s.split('_').map(&:capitalize).join(' ') << ':'
+
+  if value.empty?
+    output = ' ** No Data **'
+  else
+    output = case key
+    when /playing_station|recent_activity/
+      formatter.list(value)
+    when /liked_tracks|bookmarked_tracks/
+      formatter.tracks(value)
+    when /liked_artists|bookmarked_artists|stations|liked_stations/
+      formatter.sort_list(value)
+    when :liked_albums
+      formatter.albums(value)
+    when /following|followers/
+      formatter.followx(value)
+    end
+  end
+
+  write "#{ title }\n#{ output }"
+end
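For orientation, the flag-to-method dispatch in the loop above can be traced by hand. The snippet below is illustrative only and is not part of the package; a plain string stands in for the data_type value:

    /(bookmark|like)e?d_(.*)/ =~ 'liked_tracks'
    $1 << 's'   #=> "likes"
    $2.to_sym   #=> :tracks
    # 'bookmarked_artists' would yield "bookmarks" and :artists, so the script
    # effectively calls scraper.likes(:tracks) or scraper.bookmarks(:artists).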
data/lib/pandata.rb
ADDED
@@ -0,0 +1,18 @@
+# encoding: utf-8
+
+require_relative 'pandata/data_urls'
+require_relative 'pandata/downloader'
+require_relative 'pandata/parser'
+require_relative 'pandata/scraper'
+
+module Pandata
+  module Version
+    MAJOR = 0
+    MINOR = 1
+    PATCH = 0
+    BUILD = 'pre'
+
+    STRING = [MAJOR, MINOR, PATCH, BUILD].compact.join('.')
+  end
+end
+
data/lib/pandata/argv_parser.rb
ADDED
@@ -0,0 +1,131 @@
+require 'optparse'
+require_relative '../pandata'
+
+module Pandata
+
+  # Parses command-line input.
+  class ArgvParser
+    # Prevent instances
+    private_class_method :new
+
+    # Takes an ARGV (array) argument.
+    #
+    # Returns a hash with:
+    # - :opts (OptionParser object)
+    # - :user_id (string)
+    # - :output_file (string)
+    # - :data_to_get (array)
+    # - :get_all_data (boolean)
+    # - :return_as_json (boolean)
+    def self.parse(argv)
+      options = { data_to_get: [] }
+      get_all_data = false
+
+      options[:opts] = OptionParser.new do |opts|
+        opts.banner = 'Pandata: A tool for downloading Pandora.com data (likes, bookmarks, stations, etc.)'
+        opts.define_head 'Usage: pandata <email|webname> [options]'
+        opts.separator <<-END
+
+  Examples:
+    pandata john@example.com --liked_tracks
+    pandata my_webname --all -o my_pandora_data.txt
+    pandata my_webname -lLb --json
+
+  Options:
+        END
+
+        opts.on('--all', 'Get all data') do
+          get_all_data = true
+        end
+
+        opts.on('-a', '--recent_activity', 'Get recent activity') do
+          options[:data_to_get] << :recent_activity
+        end
+
+        opts.on('-B', '--bookmarked_artists', 'Get all bookmarked artists') do
+          options[:data_to_get] << :bookmarked_artists
+        end
+
+        opts.on('-b', '--bookmarked_tracks', 'Get all bookmarked tracks') do
+          options[:data_to_get] << :bookmarked_tracks
+        end
+
+        opts.on('-F', '--followers', "Get all user's followers") do
+          options[:data_to_get] << :followers
+        end
+
+        opts.on('-f', '--following', 'Get all users being followed by user') do
+          options[:data_to_get] << :following
+        end
+
+        opts.on('-j', '--json', 'Return the results as JSON') do
+          options[:return_as_json] = true
+        end
+
+        opts.on('-L', '--liked_artists', 'Get all liked artists') do
+          options[:data_to_get] << :liked_artists
+        end
+
+        opts.on('-l', '--liked_tracks', 'Get all liked tracks') do
+          options[:data_to_get] << :liked_tracks
+        end
+
+        opts.on('-m', '--liked_albums', 'Get all liked albums') do
+          options[:data_to_get] << :liked_albums
+        end
+
+        opts.on('-n', '--liked_stations', 'Get all liked stations') do
+          options[:data_to_get] << :liked_stations
+        end
+
+        opts.on('-o', '--output_file PATH', 'File to output the data into') do |path|
+          options[:output_file] = path
+        end
+
+        opts.on('-S', '--playing_station', 'Get currently playing station') do
+          options[:data_to_get] << :playing_station
+        end
+
+        opts.on('-s', '--stations', 'Get all stations') do
+          options[:data_to_get] << :stations
+        end
+
+        opts.on_tail("-h", "--help", "Show this message") do
+          puts opts
+          exit
+        end
+
+        opts.on_tail("--version", "Show version") do
+          puts Pandata::Version::STRING
+          exit
+        end
+      end
+
+      options[:opts].parse(argv)
+
+      # User ID is the first argument.
+      options[:user_id] = argv.shift
+
+      if get_all_data
+        options[:data_to_get] = [
+          :recent_activity,
+          :playing_station,
+          :stations,
+          :bookmarked_tracks,
+          :bookmarked_artists,
+          :liked_tracks,
+          :liked_artists,
+          :liked_albums,
+          :liked_stations,
+          :followers,
+          :following
+        ]
+      else
+        # Remove any duplicates caused by supplying flags multiple times.
+        options[:data_to_get].uniq!
+      end
+
+      options
+    end
+  end
+end
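As a rough sketch of what ArgvParser.parse returns (the webname and flags are invented; the values shown are what the option blocks above would accumulate, assuming the user ID is given first as the usage line expects):

    options = Pandata::ArgvParser.parse(%w[my_webname -l -B --json])
    options[:user_id]          #=> "my_webname"
    options[:data_to_get]      #=> [:liked_tracks, :bookmarked_artists]
    options[:return_as_json]   #=> true
    options[:opts]             # the OptionParser instance, used to print usage help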
data/lib/pandata/data_formatter.rb
ADDED
@@ -0,0 +1,105 @@
+require 'set'
+
+module Pandata
+
+  # Sorts and formats Pandata::Scraper data as a string for printing.
+  class DataFormatter
+
+    # Takes an array or string and returns a string with each item on its own line.
+    #--
+    #
+    # Example output:
+    #  - item1
+    #  - item2
+    #  - item3
+    #
+    #++
+    def list(data)
+      data = [data] unless data.kind_of?(Array)
+      str = ''
+      data.each { |item| str << " - #{item}\n" }
+      str
+    end
+
+    # Identical to #list but sorts alphabetically ignoring 'the'.
+    def sort_list(data)
+      list custom_sort(data)
+    end
+
+    # Takes an array of hashes with :artist and :track keys.
+    def tracks(tracks)
+      artists_items(tracks, :track)
+    end
+
+    # Takes an array of hashes with :artist and :album keys.
+    def albums(albums)
+      artists_items(albums, :album)
+    end
+
+    # Takes an array of hashes with :name, :webname and :href keys.
+    def followx(data)
+      str = ''
+      data.sort_by { |item| item[:webname].downcase }.each do |hash|
+        str << " - name: #{hash[:name]}\n"
+        str << "   webname: #{hash[:webname]}\n"
+        str << "   href: #{hash[:href]}\n"
+      end
+      str
+    end
+
+    private
+
+    # Takes an array or hash.
+    # Sorts alphabetically ignoring the initial 'The' when sorting strings.
+    # Also case-insensitive to prevent lowercase names from being sorted last.
+    def custom_sort(enumerable)
+      sorted_array = enumerable.sort_by { |key, _| key.sub(/^the\s*/i, '').downcase }
+
+      # sort_by() returns an array when called on hashes.
+      if enumerable.kind_of?(Hash)
+        # Rebuild the hash.
+        sorted_hash = {}
+        sorted_array.each { |item| sorted_hash[item[0]] = item[1] }
+        sorted_hash
+      else
+        sorted_array
+      end
+    end
+
+    # Takes an array of hashes with :artist and another key belonging to an
+    # artist (e.g. :track or :album).
+    # Returns a string with each artist name on a line with the artist's items
+    # listed and indented below. Sorts the output, too.
+    #--
+    #
+    # Example output:
+    #  - Artist1:
+    #      - item2
+    #      - item3
+    #  - Artist2:
+    #      - item1
+    #      - item1
+    #
+    #++
+    def artists_items(data, item_name)
+      artists_items = {}
+
+      data.each do |hash|
+        artist_name = hash[:artist]
+        (artists_items[artist_name] ||= Set.new) << hash[item_name]
+      end
+
+      artists_items = custom_sort(artists_items)
+
+      str = ''
+      artists_items.each do |artist_name, items|
+        str << " - #{artist_name}\n"
+        custom_sort(items).each do |item|
+          str << "     - #{item}\n"
+        end
+      end
+      str
+    end
+
+  end
+end
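A small illustration of the 'the'-insensitive sorting (the station names are invented; this snippet is not part of the package):

    formatter = Pandata::DataFormatter.new
    puts formatter.sort_list(['The Zombies', 'Air', 'the Beatles'])
    #  - Air
    #  - the Beatles
    #  - The Zombies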
data/lib/pandata/data_urls.rb
ADDED
@@ -0,0 +1,20 @@
+module Pandata
+  # Number of results to get from a feeds.pandora.com URL.
+  MAX_RESULTS = 100000 # Get everything...
+
+  # URLs to Pandora's data!
+  DATA_FEED_URLS = {
+    user_search: 'http://www.pandora.com/content/connect?searchString=%{searchString}',
+    recent_activity: 'http://feeds.pandora.com/feeds/people/%{webname}/recentactivity.xml',
+    playing_station: 'http://feeds.pandora.com/feeds/people/%{webname}/nowplaying.xml',
+    stations: "http://feeds.pandora.com/feeds/people/%{webname}/stations.xml?max=#{MAX_RESULTS}",
+    bookmarked_tracks: "http://feeds.pandora.com/feeds/people/%{webname}/favorites.xml?max=#{MAX_RESULTS}",
+    bookmarked_artists: "http://feeds.pandora.com/feeds/people/%{webname}/favoriteartists.xml?max=#{MAX_RESULTS}",
+    liked_tracks: 'http://www.pandora.com/content/tracklikes?likeStartIndex=%{nextLikeStartIndex}&thumbStartIndex=%{nextThumbStartIndex}&webname=%{webname}',
+    liked_artists: 'http://www.pandora.com/content/artistlikes?artistStartIndex=%{nextStartIndex}&webname=%{webname}',
+    liked_stations: 'http://www.pandora.com/content/stationlikes?stationStartIndex=%{nextStartIndex}&webname=%{webname}',
+    liked_albums: 'http://www.pandora.com/content/albumlikes?albumStartIndex=%{nextStartIndex}&webname=%{webname}',
+    following: 'http://www.pandora.com/content/following?startIndex=%{nextStartIndex}&webname=%{webname}',
+    followers: 'http://www.pandora.com/content/followers?startIndex=%{nextStartIndex}&webname=%{webname}'
+  }
+end
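Each entry is a template filled with Ruby's String#% and named placeholders. For example (the webname is hypothetical; this line is not part of the gem):

    Pandata::DATA_FEED_URLS[:liked_artists] % { nextStartIndex: 0, webname: 'my_webname' }
    #=> "http://www.pandora.com/content/artistlikes?artistStartIndex=0&webname=my_webname"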
data/lib/pandata/downloader.rb
ADDED
@@ -0,0 +1,56 @@
+require 'json'
+require 'open-uri'
+
+module Pandata
+  # Custom Pandata error
+  class PandataError < StandardError
+  end
+
+  # Retrieves data from Pandora and handles errors.
+  class Downloader
+    # A GitHub Gist that contains an updated cookie allowing access to 'login-only' visible data.
+    CONFIG_URL = 'https://gist.github.com/ustasb/596f1ee96d03463fde77/raw/pandata_config.json'
+
+    class << self
+      attr_accessor :cookie
+    end
+
+    # Gets a Pandora cookie and returns a Downloader instance.
+    def initialize
+      # If we already have a cookie, don't get another.
+      unless Downloader.cookie
+        Downloader.cookie = get_cookie
+      end
+    end
+
+    # Downloads a page and returns its content as a string.
+    def read_page(url)
+      download(url, Downloader.cookie).read
+    end
+
+    private
+
+    # Downloads a page and handles errors.
+    def download(url, cookie = '')
+      escaped_url = URI.escape(url)
+
+      begin
+        open(escaped_url, 'Cookie' => cookie, :read_timeout => 5)
+      rescue OpenURI::HTTPError => error
+        puts "The network request for:\n #{url}\nreturned an error:\n #{error.message}"
+        puts "Please try again later or update Pandata. Sorry about that!\n\nFull error:"
+        raise PandataError
+      end
+    end
+
+    def get_cookie
+      config = JSON.parse download(CONFIG_URL).read
+
+      if Gem::Version.new(Pandata::Version::STRING) <= Gem::Version.new(config['required_update_for'])
+        raise PandataError, 'Pandora.com has changed something and you need to update Pandata!'
+      end
+
+      config['cookie']
+    end
+  end
+end
data/lib/pandata/parser.rb
ADDED
@@ -0,0 +1,191 @@
+require 'nokogiri'
+
+module Pandata
+
+  # Parses HTML/XML pages from Pandora for relevant data.
+  class Parser
+
+    # Returns an array of webnames.
+    def get_webnames_from_search(html)
+      user_links = Nokogiri::HTML(html).css('.user_name a')
+      webnames = []
+
+      user_links.each do |link|
+        webnames << link['webname']
+      end
+
+      webnames
+    end
+
+    # Returns the query parameters necessary to get the next page of data
+    # from Pandora.
+    def get_next_data_indices(html)
+      show_more = Nokogiri::HTML(html).css('.show_more')[0]
+
+      if show_more
+        next_indices = {}
+        data_attributes = ['nextStartIndex', 'nextLikeStartIndex', 'nextThumbStartIndex']
+        data_attributes.each do |attr_name|
+          attr = show_more.attributes['data-' + attr_name.downcase]
+          next_indices[attr_name.to_sym] = attr.value.to_i if attr
+        end
+
+        next_indices
+      else
+        false
+      end
+    end
+
+    # Returns an array of recent activities.
+    def get_recent_activity(xml)
+      activity_names = []
+
+      xml_each_item(xml) do |title|
+        activity_names << title
+      end
+
+      activity_names
+    end
+
+    # Returns an array of station names.
+    def get_stations(xml)
+      stations = []
+
+      xml_each_item(xml) do |title|
+        stations << title
+      end
+
+      stations
+    end
+
+    # Returns the currently playing station name.
+    def get_playing_station(xml)
+      station = ''
+
+      xml_each_item(xml) do |title|
+        station = title # First title is the station name.
+        break
+      end
+
+      station
+    end
+
+    # Returns an array of hashes with :artist and :track keys.
+    def get_bookmarked_tracks(xml)
+      tracks = []
+
+      xml_each_item(xml) do |title|
+        track, artist = title.split(' by ')
+        tracks << { artist: artist, track: track }
+      end
+
+      tracks
+    end
+
+    # Returns an array of artist names.
+    def get_bookmarked_artists(xml)
+      artists = []
+
+      xml_each_item(xml) do |title|
+        artists << title
+      end
+
+      artists
+    end
+
+    # Returns an array of hashes with :artist and :track keys.
+    def get_liked_tracks(html)
+      tracks = []
+
+      infobox_each_link(html) do |title, subtitle|
+        tracks << { track: title, artist: subtitle }
+      end
+
+      tracks
+    end
+
+    # Returns an array of artist names.
+    def get_liked_artists(html)
+      get_infobox_titles(html)
+    end
+
+    # Returns an array of station names.
+    def get_liked_stations(html)
+      get_infobox_titles(html)
+    end
+
+    # Returns an array of hashes with :artist and :album keys.
+    def get_liked_albums(html)
+      albums = []
+
+      infobox_each_link(html) do |title, subtitle|
+        albums << { album: title, artist: subtitle }
+      end
+
+      albums
+    end
+
+    # Returns an array of hashes with :name, :webname and :href keys.
+    def get_following(html)
+      get_followx_users(html)
+    end
+
+    # Returns an array of hashes with :name, :webname and :href keys.
+    def get_followers(html)
+      get_followx_users(html)
+    end
+
+    private
+
+    # Loops over each 'item' tag and yields the title and description.
+    def xml_each_item(xml)
+      Nokogiri::XML(xml).css('item').each do |item|
+        title = item.at_css('title').text
+        desc = item.at_css('description').text
+        yield(title, desc)
+      end
+    end
+
+    # Loops over each .infobox container and yields the title and subtitle.
+    def infobox_each_link(html)
+      Nokogiri::HTML(html).css('.infobox').each do |infobox|
+        infobox_body = infobox.css('.infobox-body')
+
+        title_link = infobox_body.css('h3 a').text.strip
+        subtitle_link = infobox_body.css('p a').first
+        subtitle_link = subtitle_link.text.strip if subtitle_link
+
+        yield(title_link, subtitle_link)
+      end
+    end
+
+    # Returns an array of titles from #infobox_each_link.
+    def get_infobox_titles(html)
+      titles = []
+      infobox_each_link(html) { |title| titles << title }
+      titles
+    end
+
+    # Loops over each .follow_section container and returns a hash with
+    # :name, :webname and :href keys.
+    def get_followx_users(html)
+      users = []
+
+      Nokogiri::HTML(html).css('.follow_section').each do |section|
+        listener_name = section.css('.listener_name').first
+        webname = listener_name['webname']
+
+        # Remove any 'spans with a space' that sometimes appear with special characters.
+        listener_name.css('span').each(&:remove)
+        name = listener_name.text.strip
+
+        href = section.css('a').first['href']
+
+        users << { name: name, webname: webname, href: href }
+      end
+
+      users
+    end
+
+  end
+end
data/lib/pandata/scraper.rb
ADDED
@@ -0,0 +1,165 @@
+require_relative 'data_urls'
+require_relative 'parser'
+require_relative 'downloader'
+
+module Pandata
+
+  # Downloads a user's Pandora.com data.
+  # A user's profile must be public for Pandata to download its data.
+  class Scraper
+
+    # What Pandora uses to identify a user; it remains constant even if
+    # the user ties a new email address to their Pandora account.
+    attr_reader :webname
+
+    # Takes either an email or a webname string.
+    # Returns either:
+    # - a new scraper object for the supplied user ID.
+    # - an array of similar webnames because a matching Pandora user could not be found.
+    def self.get(user_id)
+      search_url = DATA_FEED_URLS[:user_search] % { searchString: user_id }
+      html = Downloader.new.read_page(search_url)
+      webnames = Parser.new.get_webnames_from_search(html)
+
+      if webnames.include?(user_id)
+        new(user_id)
+      # If user_id looks like an email and still gets a result.
+      elsif webnames.size == 1 && /.*@.*\..*/ =~ user_id
+        new(webnames.first)
+      else
+        webnames
+      end
+    end
+
+    private_class_method :new
+    def initialize(webname)
+      @downloader = Downloader.new
+      @parser = Parser.new
+      @webname = webname
+    end
+
+    # Returns an array of the user's recent activity.
+    def recent_activity
+      scrape_for(:recent_activity, :get_recent_activity)
+    end
+
+    # Returns the user's currently playing station.
+    def playing_station
+      scrape_for(:playing_station, :get_playing_station).first
+    end
+
+    # Returns an array of the user's stations.
+    def stations
+      scrape_for(:stations, :get_stations)
+    end
+
+    # Returns a user's bookmarked data.
+    #
+    # Bookmark types:
+    # - :artists - Returns an array of artist names.
+    # - :tracks - Returns an array of hashes with :artist and :track keys.
+    # - :all - Returns a hash with all bookmarked data.
+    def bookmarks(bookmark_type = :all)
+      case bookmark_type
+      when :tracks
+        scrape_for(:bookmarked_tracks, :get_bookmarked_tracks)
+      when :artists
+        scrape_for(:bookmarked_artists, :get_bookmarked_artists)
+      when :all
+        { artists: bookmarks(:artists),
+          tracks: bookmarks(:tracks) }
+      end
+    end
+
+    # Returns a user's liked data. (The results from giving a 'thumbs up.')
+    #
+    # Like types:
+    # - :artists - Returns an array of artist names.
+    # - :albums - Returns an array of album names.
+    # - :stations - Returns an array of station names.
+    # - :tracks - Returns an array of hashes with :artist and :track keys.
+    # - :all - Returns a hash with all liked data.
+    def likes(like_type = :all)
+      case like_type
+      when :tracks
+        scrape_for(:liked_tracks, :get_liked_tracks)
+      when :artists
+        scrape_for(:liked_artists, :get_liked_artists)
+      when :stations
+        scrape_for(:liked_stations, :get_liked_stations)
+      when :albums
+        scrape_for(:liked_albums, :get_liked_albums)
+      when :all
+        { artists: likes(:artists),
+          albums: likes(:albums),
+          stations: likes(:stations),
+          tracks: likes(:tracks) }
+      end
+    end
+
+    # Returns the *public* users being followed by the user.
+    #
+    # Returns an array of hashes with keys:
+    # - :name - Profile name
+    # - :webname - Unique Pandora ID
+    # - :href - URL to online Pandora profile.
+    def following
+      scrape_for(:following, :get_following)
+    end
+
+    # Returns the user's followers in a format identical to #following.
+    def followers
+      scrape_for(:followers, :get_followers)
+    end
+
+    private
+
+    # Downloads all data for a given type, calls the supplied Pandata::Parser
+    # method and removes any duplicates.
+    def scrape_for(data_type, parser_method)
+      results = []
+
+      url = get_url(data_type)
+      download_all_data(url) do |html, next_data_indices|
+        new_data = @parser.public_send(parser_method, html)
+
+        if new_data.kind_of?(Array)
+          results.concat(new_data)
+        else
+          results.push(new_data)
+        end
+
+        get_url(data_type, next_data_indices) if next_data_indices
+      end
+
+      # Pandora data often contains duplicates--get rid of them.
+      results.uniq
+    end
+
+    # Downloads all data given a starting URL. Some Pandora feeds only return
+    # 5 - 10 items per page but contain a link to the next set of data. Threads
+    # cannot be used because page A must be visited to know how to obtain page B.
+    def download_all_data(url)
+      next_data_indices = {}
+
+      while next_data_indices
+        html = @downloader.read_page(url)
+        next_data_indices = @parser.get_next_data_indices(html)
+        url = yield(html, next_data_indices)
+      end
+    end
+
+    # Grabs a URL from DATA_FEED_URLS and formats it appropriately.
+    def get_url(data_name, next_data_indices = {})
+      next_data_indices = {
+        nextStartIndex: 0,
+        nextLikeStartIndex: 0,
+        nextThumbStartIndex: 0
+      } if next_data_indices.empty?
+
+      next_data_indices[:webname] = @webname
+      DATA_FEED_URLS[data_name] % next_data_indices
+    end
+
+  end
+end
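A minimal usage sketch of the Scraper API above; the webname is hypothetical and the commented return shapes follow the method documentation rather than a real run:

    scraper = Pandata::Scraper.get('my_webname')
    if scraper.kind_of?(Array)
      puts scraper              # similar webnames; no exact match was found
    else
      scraper.likes(:tracks)    # [{ artist: '...', track: '...' }, ...]
      scraper.bookmarks(:all)   # { artists: [...], tracks: [...] }
      scraper.followers         # [{ name: '...', webname: '...', href: '...' }, ...]
    end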
metadata
ADDED
@@ -0,0 +1,81 @@
+--- !ruby/object:Gem::Specification
+name: pandata
+version: !ruby/object:Gem::Version
+  version: 0.1.0.pre
+platform: ruby
+authors:
+- Brian Ustas
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-03-10 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.5.6
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.5.6
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.12.2
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.12.2
+description: A library and tool for downloading Pandora.com data (likes, bookmarks,
+  stations, etc.)
+email: brianustas@gmail.com
+executables:
+- pandata
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/pandata/argv_parser.rb
+- lib/pandata/data_formatter.rb
+- lib/pandata/data_urls.rb
+- lib/pandata/downloader.rb
+- lib/pandata/parser.rb
+- lib/pandata/scraper.rb
+- lib/pandata.rb
+- bin/pandata
+homepage: https://github.com/ustasb/pandata
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: 1.9.1
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>'
+    - !ruby/object:Gem::Version
+      version: 1.3.1
+requirements: []
+rubyforge_project:
+rubygems_version: 2.0.2
+signing_key:
+specification_version: 4
+summary: A Pandora.com web scraper
+test_files: []