pandata 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -1
- data/bin/pandata +4 -82
- data/lib/pandata.rb +4 -2
- data/lib/pandata/argv_parser.rb +5 -5
- data/lib/pandata/cli.rb +144 -0
- data/lib/pandata/data_urls.rb +1 -1
- data/lib/pandata/downloader.rb +19 -23
- data/lib/pandata/scraper.rb +7 -3
- metadata +48 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7e2bc9075307ca70cb1e11c34ca8e817d3fea79c
|
4
|
+
data.tar.gz: bca7a313c16dd995cfd0f6094e7cb35b5f539c69
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 833e93f00606f5aac3c4a3a297f3fd7ee7f61614de1868b69d9cbbcb68fe218a43275c4087fada150e016d2da02e6bc2ef2dfb08bf368b7a389574140a41f867
|
7
|
+
data.tar.gz: 1ba8b3230c426ec828952376c79e1c4cb649bd3f936db3564ff774f51d0398b1eb9fbbc8beb229a0683ac4dda3e8ff0406cd8266f5a1f5e8af34835afb7347dc
|
data/README.md
CHANGED
@@ -57,7 +57,7 @@ Next, start scraping!
|
|
57
57
|
# Get all followers
|
58
58
|
followers = johns_scraper.followers
|
59
59
|
|
60
|
-
For more information, see the documentation for Pandata::Scraper.
|
60
|
+
For more information, see the [documentation][2] for Pandata::Scraper.
|
61
61
|
|
62
62
|
### As a Command-Line Tool
|
63
63
|
|
@@ -82,3 +82,4 @@ For an up-to-date list, check out:
|
|
82
82
|
pandata my_webname --all -o my_pandora_data.txt
|
83
83
|
|
84
84
|
[1]: http://www.pandora.com/feeds
|
85
|
+
[2]: http://rubydoc.info/gems/pandata/frames
|
data/bin/pandata
CHANGED
@@ -1,86 +1,8 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
require_relative '../lib/pandata'
|
4
|
-
require_relative '../lib/pandata/argv_parser'
|
5
|
-
require_relative '../lib/pandata/data_formatter'
|
3
|
+
require_relative '../lib/pandata/cli'
|
6
4
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
if output_file
|
11
|
-
File.delete(output_file) if File.exists?(output_file)
|
12
|
-
|
13
|
-
Object.send(:define_method, :write) do |string|
|
14
|
-
File.open(output_file, 'a') do |file|
|
15
|
-
file.puts string
|
16
|
-
end
|
17
|
-
end
|
18
|
-
else
|
19
|
-
def write(string)
|
20
|
-
puts string
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
if ARGV.empty?
|
25
|
-
# Print command-line usage help.
|
26
|
-
puts options[:opts]
|
27
|
-
exit
|
28
|
-
end
|
29
|
-
|
30
|
-
scraper = Pandata::Scraper.get(options[:user_id])
|
31
|
-
formatter = Pandata::DataFormatter.new
|
32
|
-
|
33
|
-
# If scraper is an array, a Pandora user could not be found with certainty.
|
34
|
-
# In this case, scraper will contain webnames similar to options[:user_id].
|
35
|
-
if scraper.kind_of?(Array)
|
36
|
-
puts "No exact match for '#{options[:user_id]}'."
|
37
|
-
|
38
|
-
unless scraper.empty?
|
39
|
-
puts "\nWebname results for '#{options[:user_id]}':"
|
40
|
-
puts formatter.list(scraper)
|
41
|
-
end
|
42
|
-
|
43
|
-
exit
|
44
|
-
end
|
45
|
-
|
46
|
-
scraper_data = {}
|
47
|
-
options[:data_to_get].each do |data_type|
|
48
|
-
if /(bookmark|like)e?d_(.*)/ =~ data_type
|
49
|
-
method = $1 << 's' # 'likes' or 'bookmarks'
|
50
|
-
argument = $2.to_sym # :tracks, :artists, :stations or :albums
|
51
|
-
scraper_data[data_type] = scraper.public_send(method, argument)
|
52
|
-
else
|
53
|
-
scraper_data[data_type] = scraper.public_send(data_type)
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
if options[:return_as_json]
|
58
|
-
require 'json'
|
59
|
-
write JSON.generate(scraper_data)
|
60
|
-
exit
|
61
|
-
end
|
62
|
-
|
63
|
-
scraper_data.each do |key, value|
|
64
|
-
# Capitalize each word in the key symbol.
|
65
|
-
# e.g. :liked_tracks becomes 'Liked Tracks:'
|
66
|
-
title = key.to_s.split('_').map(&:capitalize).join(' ') << ':'
|
67
|
-
|
68
|
-
if value.empty?
|
69
|
-
output = ' ** No Data **'
|
70
|
-
else
|
71
|
-
output = case key
|
72
|
-
when /playing_station|recent_activity/
|
73
|
-
formatter.list(value)
|
74
|
-
when /liked_tracks|bookmarked_tracks/
|
75
|
-
formatter.tracks(value)
|
76
|
-
when /liked_artists|bookmarked_artists|stations|liked_stations/
|
77
|
-
formatter.sort_list(value)
|
78
|
-
when :liked_albums
|
79
|
-
formatter.albums(value)
|
80
|
-
when /following|followers/
|
81
|
-
formatter.followx(value)
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
write "#{ title }\n#{ output }"
|
5
|
+
begin
|
6
|
+
Pandata::CLI.scrape(ARGV)
|
7
|
+
rescue Pandata::PandataError
|
86
8
|
end
|
data/lib/pandata.rb
CHANGED
@@ -6,10 +6,12 @@ require_relative 'pandata/parser'
|
|
6
6
|
require_relative 'pandata/scraper'
|
7
7
|
|
8
8
|
module Pandata
|
9
|
+
class PandataError < StandardError; end
|
10
|
+
|
9
11
|
module Version
|
10
12
|
MAJOR = 0
|
11
|
-
MINOR = 1
|
12
|
-
PATCH = 2
|
13
|
+
MINOR = 2
|
14
|
+
PATCH = 1
|
13
15
|
BUILD = nil
|
14
16
|
|
15
17
|
STRING = [MAJOR, MINOR, PATCH, BUILD].compact.join('.')
|
data/lib/pandata/argv_parser.rb
CHANGED
@@ -15,6 +15,8 @@ module Pandata
|
|
15
15
|
# - :output_file [String]
|
16
16
|
# - :data_to_get [Array]
|
17
17
|
# - :get_all_data [Boolean]
|
18
|
+
# - :help [Boolean]
|
19
|
+
# - :version [Boolean]
|
18
20
|
# - :return_as_json [Boolean]
|
19
21
|
def self.parse(argv)
|
20
22
|
options = { data_to_get: [] }
|
@@ -90,20 +92,18 @@ Options:
|
|
90
92
|
end
|
91
93
|
|
92
94
|
opts.on_tail("-h", "--help", "Show this message") do
|
93
|
-
|
94
|
-
exit
|
95
|
+
options[:help] = true
|
95
96
|
end
|
96
97
|
|
97
98
|
opts.on_tail("--version", "Show version") do
|
98
|
-
|
99
|
-
exit
|
99
|
+
options[:version] = true
|
100
100
|
end
|
101
101
|
end
|
102
102
|
|
103
103
|
options[:opts].parse(argv)
|
104
104
|
|
105
105
|
# User ID is the first argument.
|
106
|
-
options[:user_id] = argv
|
106
|
+
options[:user_id] = argv[0]
|
107
107
|
|
108
108
|
if get_all_data
|
109
109
|
options[:data_to_get] = [
|
data/lib/pandata/cli.rb
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'ruby-progressbar'
|
3
|
+
require_relative '../pandata'
|
4
|
+
require_relative 'argv_parser'
|
5
|
+
require_relative 'data_formatter'
|
6
|
+
|
7
|
+
module Pandata
|
8
|
+
|
9
|
+
# Pandata command-line interface
|
10
|
+
class CLI
|
11
|
+
|
12
|
+
def self.scrape(argv)
|
13
|
+
options = Pandata::ArgvParser.parse(argv)
|
14
|
+
|
15
|
+
if argv.empty? || options[:help]
|
16
|
+
puts options[:opts].to_s # Log usage information
|
17
|
+
elsif options[:version]
|
18
|
+
puts Pandata::Version::STRING
|
19
|
+
else
|
20
|
+
new(options).download_and_output
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def initialize(options)
|
25
|
+
@data_to_get = options[:data_to_get]
|
26
|
+
@output_file = options[:output_file]
|
27
|
+
@return_as_json = options[:return_as_json]
|
28
|
+
|
29
|
+
@scraper = scraper_for(options[:user_id])
|
30
|
+
@scraper.download_cb = method(:update_progress)
|
31
|
+
end
|
32
|
+
|
33
|
+
def update_progress(num_data)
|
34
|
+
progressbar.progress += num_data
|
35
|
+
end
|
36
|
+
|
37
|
+
def download_and_output
|
38
|
+
output_data format_data(download_data, @return_as_json)
|
39
|
+
end
|
40
|
+
|
41
|
+
private
|
42
|
+
|
43
|
+
def progressbar
|
44
|
+
@progressbar ||= ProgressBar.create(
|
45
|
+
title: 'Data Downloaded',
|
46
|
+
format: '%t: %c',
|
47
|
+
total: nil
|
48
|
+
)
|
49
|
+
end
|
50
|
+
|
51
|
+
def formatter
|
52
|
+
@formatter ||= DataFormatter.new
|
53
|
+
end
|
54
|
+
|
55
|
+
def log(msg)
|
56
|
+
puts msg
|
57
|
+
end
|
58
|
+
|
59
|
+
# Writes the data to STDOUT or a file.
|
60
|
+
# @param formatted_data [String]
|
61
|
+
def output_data(formatted_data)
|
62
|
+
@progressbar.stop if @progressbar
|
63
|
+
|
64
|
+
if @output_file
|
65
|
+
File.write(@output_file, formatted_data)
|
66
|
+
else
|
67
|
+
log formatted_data
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
# Formats data as a string list or JSON.
|
72
|
+
# @param data [Hash]
|
73
|
+
# @param json [Boolean]
|
74
|
+
# @return [String]
|
75
|
+
def format_data(data, json = false)
|
76
|
+
if json
|
77
|
+
JSON.generate(data)
|
78
|
+
else
|
79
|
+
data.map do |category, cat_data|
|
80
|
+
# Capitalize each word in the category symbol.
|
81
|
+
# e.g. :liked_tracks becomes 'Liked Tracks'
|
82
|
+
title = category.to_s.split('_').map(&:capitalize).join(' ')
|
83
|
+
|
84
|
+
output = if cat_data.empty?
|
85
|
+
" ** No Data **\n"
|
86
|
+
else
|
87
|
+
case category
|
88
|
+
when /playing_station|recent_activity/
|
89
|
+
formatter.list(cat_data)
|
90
|
+
when /liked_tracks|bookmarked_tracks/
|
91
|
+
formatter.tracks(cat_data)
|
92
|
+
when /liked_artists|bookmarked_artists|stations|liked_stations/
|
93
|
+
formatter.sort_list(cat_data)
|
94
|
+
when :liked_albums
|
95
|
+
formatter.albums(cat_data)
|
96
|
+
when /following|followers/
|
97
|
+
formatter.followx(cat_data)
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
"#{title}:\n#{output}"
|
102
|
+
end.join
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
# Downloads the user's desired data.
|
107
|
+
# @return [Hash]
|
108
|
+
def download_data
|
109
|
+
scraper_data = {}
|
110
|
+
|
111
|
+
@data_to_get.each do |data_category|
|
112
|
+
if /(bookmark|like)e?d_(.*)/ =~ data_category
|
113
|
+
method = $1 << 's' # 'likes' or 'bookmarks'
|
114
|
+
argument = $2.to_sym # :tracks, :artists, :stations or :albums
|
115
|
+
scraper_data[data_category] = @scraper.public_send(method, argument)
|
116
|
+
else
|
117
|
+
scraper_data[data_category] = @scraper.public_send(data_category)
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
scraper_data
|
122
|
+
end
|
123
|
+
|
124
|
+
# Returns a scraper for the user's id.
|
125
|
+
# @param user_id [String] webname or email
|
126
|
+
# @return [Pandata::Scraper]
|
127
|
+
def scraper_for(user_id)
|
128
|
+
scraper = Pandata::Scraper.get(user_id)
|
129
|
+
|
130
|
+
if scraper.kind_of?(Array)
|
131
|
+
log "No exact match for '#{user_id}'."
|
132
|
+
|
133
|
+
unless scraper.empty?
|
134
|
+
log "\nWebname results for '#{user_id}':\n#{formatter.list(scraper)}"
|
135
|
+
end
|
136
|
+
|
137
|
+
raise PandataError, "Could not create a scraper for '#{user_id}'."
|
138
|
+
end
|
139
|
+
|
140
|
+
scraper
|
141
|
+
end
|
142
|
+
|
143
|
+
end
|
144
|
+
end
|
data/lib/pandata/data_urls.rb
CHANGED
data/lib/pandata/downloader.rb
CHANGED
@@ -1,30 +1,23 @@
|
|
1
1
|
require 'json'
|
2
2
|
require 'open-uri'
|
3
|
+
require_relative '../pandata'
|
3
4
|
|
4
5
|
module Pandata
|
5
|
-
class PandataError < StandardError; end
|
6
6
|
|
7
|
-
# Retrieves data from Pandora.com and handles errors.
|
7
|
+
# Retrieves data from Pandora.com and handles network errors.
|
8
8
|
class Downloader
|
9
|
+
|
9
10
|
# A GitHub Gist that contains an updated cookie allowing access to 'login-only' visible data.
|
10
11
|
CONFIG_URL = 'https://gist.github.com/ustasb/596f1ee96d03463fde77/raw/pandata_config.json'
|
11
12
|
|
12
|
-
|
13
|
-
|
14
|
-
end
|
15
|
-
|
16
|
-
# Gets a Pandora cookie and returns a Downloader instance.
|
17
|
-
def initialize
|
18
|
-
unless Downloader.cookie
|
19
|
-
Downloader.cookie = get_cookie
|
20
|
-
end
|
21
|
-
end
|
13
|
+
# The cached cookie.
|
14
|
+
@@cookie = nil
|
22
15
|
|
23
16
|
# Downloads and reads a page from a URL.
|
24
17
|
# @param url [String]
|
25
18
|
# @return [String] contents of page
|
26
|
-
def read_page(url)
|
27
|
-
download(url,
|
19
|
+
def self.read_page(url)
|
20
|
+
download(url, get_cookie).read
|
28
21
|
end
|
29
22
|
|
30
23
|
private
|
@@ -33,19 +26,21 @@ module Pandata
|
|
33
26
|
# @param url [String]
|
34
27
|
# @param cookie [String]
|
35
28
|
# @return [File]
|
36
|
-
def download(url, cookie = '')
|
29
|
+
def self.download(url, cookie = '')
|
37
30
|
escaped_url = URI.escape(url)
|
38
31
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
32
|
+
open(escaped_url, 'Cookie' => cookie, :read_timeout => 5)
|
33
|
+
rescue OpenURI::HTTPError => error
|
34
|
+
puts "The network request for:\n #{url}\nreturned an error:\n #{error.message}"
|
35
|
+
puts "Please try again later or update Pandata. Sorry about that!\n\nFull error:"
|
36
|
+
raise PandataError
|
37
|
+
end
|
38
|
+
|
39
|
+
def self.get_cookie
|
40
|
+
@@cookie ||= download_cookie
|
46
41
|
end
|
47
42
|
|
48
|
-
def
|
43
|
+
def self.download_cookie
|
49
44
|
config = JSON.parse download(CONFIG_URL).read
|
50
45
|
|
51
46
|
if Gem::Version.new(Pandata::Version::STRING) <= Gem::Version.new(config['required_update_for'])
|
@@ -54,5 +49,6 @@ module Pandata
|
|
54
49
|
|
55
50
|
config['cookie']
|
56
51
|
end
|
52
|
+
|
57
53
|
end
|
58
54
|
end
|
data/lib/pandata/scraper.rb
CHANGED
@@ -12,6 +12,9 @@ module Pandata
|
|
12
12
|
# the user ties a new email address to their Pandora account.
|
13
13
|
attr_reader :webname
|
14
14
|
|
15
|
+
# A Proc that gets called after some data has been downloaded.
|
16
|
+
attr_accessor :download_cb
|
17
|
+
|
15
18
|
# If possible, get a Scraper instance for the user_id otherwise return
|
16
19
|
# an array of similar webnames.
|
17
20
|
# @param user_id [String] email or webname
|
@@ -19,7 +22,7 @@ module Pandata
|
|
19
22
|
# @return [Array] array of similar webnames
|
20
23
|
def self.get(user_id)
|
21
24
|
search_url = DATA_FEED_URLS[:user_search] % { searchString: user_id }
|
22
|
-
html = Downloader.
|
25
|
+
html = Downloader.read_page(search_url)
|
23
26
|
webnames = Parser.new.get_webnames_from_search(html)
|
24
27
|
|
25
28
|
if webnames.include?(user_id)
|
@@ -34,7 +37,6 @@ module Pandata
|
|
34
37
|
|
35
38
|
private_class_method :new
|
36
39
|
def initialize(webname)
|
37
|
-
@downloader = Downloader.new
|
38
40
|
@parser = Parser.new
|
39
41
|
@webname = webname
|
40
42
|
end
|
@@ -134,6 +136,8 @@ module Pandata
|
|
134
136
|
results.push(new_data)
|
135
137
|
end
|
136
138
|
|
139
|
+
@download_cb[new_data.size] if @download_cb
|
140
|
+
|
137
141
|
get_url(data_type, next_data_indices) if next_data_indices
|
138
142
|
end
|
139
143
|
|
@@ -149,7 +153,7 @@ module Pandata
|
|
149
153
|
next_data_indices = {}
|
150
154
|
|
151
155
|
while next_data_indices
|
152
|
-
html =
|
156
|
+
html = Downloader.read_page(url)
|
153
157
|
next_data_indices = @parser.get_next_data_indices(html)
|
154
158
|
url = yield(html, next_data_indices)
|
155
159
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pandata
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Brian Ustas
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-09-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -24,20 +24,62 @@ dependencies:
|
|
24
24
|
- - ~>
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 1.5.6
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: ruby-progressbar
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.2.0
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ~>
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.2.0
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: rspec
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
30
44
|
requirements:
|
31
45
|
- - ~>
|
32
46
|
- !ruby/object:Gem::Version
|
33
|
-
version: 2.
|
47
|
+
version: 2.14.0
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 2.14.0
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: vcr
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 2.5.0
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ~>
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 2.5.0
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: webmock
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ~>
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 1.13.0
|
34
76
|
type: :development
|
35
77
|
prerelease: false
|
36
78
|
version_requirements: !ruby/object:Gem::Requirement
|
37
79
|
requirements:
|
38
80
|
- - ~>
|
39
81
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
82
|
+
version: 1.13.0
|
41
83
|
- !ruby/object:Gem::Dependency
|
42
84
|
name: yard
|
43
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -63,6 +105,7 @@ extra_rdoc_files:
|
|
63
105
|
- README.md
|
64
106
|
files:
|
65
107
|
- lib/pandata/argv_parser.rb
|
108
|
+
- lib/pandata/cli.rb
|
66
109
|
- lib/pandata/data_formatter.rb
|
67
110
|
- lib/pandata/data_urls.rb
|
68
111
|
- lib/pandata/downloader.rb
|
@@ -92,7 +135,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
92
135
|
version: '0'
|
93
136
|
requirements: []
|
94
137
|
rubyforge_project:
|
95
|
-
rubygems_version: 2.0.
|
138
|
+
rubygems_version: 2.0.3
|
96
139
|
signing_key:
|
97
140
|
specification_version: 4
|
98
141
|
summary: A Pandora.com web scraper
|