findart 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +3 -0
- data/Manifest.txt +33 -0
- data/README.rdoc +63 -0
- data/Rakefile +30 -0
- data/bin/findart +55 -0
- data/features/development.feature +13 -0
- data/features/findalbumarturls.feature +38 -0
- data/features/fixtures/search-albumartexchange-daft-punk-discovery.html +107 -0
- data/features/registerdscapers.feature +12 -0
- data/features/step_definitions/common_steps.rb +169 -0
- data/features/step_definitions/findalbumarturls_steps.rb +9 -0
- data/features/support/common.rb +29 -0
- data/features/support/env.rb +14 -0
- data/features/support/matchers.rb +11 -0
- data/lib/FindArt.rb +20 -0
- data/lib/FindArt/scraper.rb +76 -0
- data/lib/FindArt/scrapers/albumartexchange.rb +22 -0
- data/lib/FindArt/scrapers/amazon.rb.disabled +47 -0
- data/lib/FindArt/scrapers/discogs.rb +44 -0
- data/lib/FindArt/scrapers/junodownload.rb +31 -0
- data/lib/FindArt/scrapers/walmart.rb +38 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/spec/albumartexchange_spec.rb +26 -0
- data/spec/amazon_spec.rb +34 -0
- data/spec/discogs_spec.rb +32 -0
- data/spec/junodownload_spec.rb +32 -0
- data/spec/scraper_spec.rb +121 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +11 -0
- data/spec/walmart_spec.rb +26 -0
- data/tasks/rspec.rake +25 -0
- metadata +164 -0

data/features/support/common.rb
ADDED
@@ -0,0 +1,29 @@
module CommonHelpers
  def in_tmp_folder(&block)
    FileUtils.chdir(@tmp_root, &block)
  end

  def in_project_folder(&block)
    project_folder = @active_project_folder || @tmp_root
    FileUtils.chdir(project_folder, &block)
  end

  def in_home_folder(&block)
    FileUtils.chdir(@home_path, &block)
  end

  def force_local_lib_override(project_name = @project_name)
    rakefile = File.read(File.join(project_name, 'Rakefile'))
    File.open(File.join(project_name, 'Rakefile'), "w+") do |f|
      f << "$:.unshift('#{@lib_path}')\n"
      f << rakefile
    end
  end

  def setup_active_project_folder project_name
    @active_project_folder = File.join(@tmp_root, project_name)
    @project_name = project_name
  end
end

World(CommonHelpers)

data/features/support/env.rb
ADDED
@@ -0,0 +1,14 @@
require File.dirname(__FILE__) + "/../../lib/FindArt"

gem 'cucumber'
require 'cucumber'
gem 'rspec'
require 'spec'

Before do
  @tmp_root = File.dirname(__FILE__) + "/../../tmp"
  @home_path = File.expand_path(File.join(@tmp_root, "home"))
  FileUtils.rm_rf @tmp_root
  FileUtils.mkdir_p @home_path
  ENV['HOME'] = @home_path
end

data/features/support/matchers.rb
ADDED
@@ -0,0 +1,11 @@
module Matchers
  def contain(expected)
    simple_matcher("contain #{expected.inspect}") do |given, matcher|
      matcher.failure_message = "expected #{given.inspect} to contain #{expected.inspect}"
      matcher.negative_failure_message = "expected #{given.inspect} not to contain #{expected.inspect}"
      given.index expected
    end
  end
end

World(Matchers)

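For context, a minimal sketch of how the matcher above might be used from a Cucumber step definition; the step text and the @stdout variable are hypothetical and not taken from this package:

Then /^the output should contain "(.*)"$/ do |text|
  # contain(...) builds an RSpec 1.x simple_matcher; it passes when
  # String#index (or Array#index) finds the expected value in the subject.
  @stdout.should contain(text)
end
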
data/lib/FindArt.rb
ADDED
@@ -0,0 +1,20 @@
$:.unshift(File.dirname(__FILE__)) unless
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))

require 'rubygems'
require 'mechanize'
require 'hpricot'

#meta programming patch
class Object # http://whytheluckystiff.net/articles/seeingMetaclassesClearly.html
  def meta_def name, &blk
    (class << self; self; end).instance_eval { define_method name, &blk }
  end
end

module FindArt
  VERSION = '0.0.1'
end

require "#{File.dirname(__FILE__)}/FindArt/scraper.rb"
Dir["#{File.dirname(__FILE__)}/FindArt/scrapers/*.rb"].each {|file| require file }

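As an aside, a minimal sketch of what the Object#meta_def patch above enables; the Widget class is made up for illustration and is not part of the gem:

# patch copied from lib/FindArt.rb
class Object
  def meta_def name, &blk
    (class << self; self; end).instance_eval { define_method name, &blk }
  end
end

class Widget; end

# defines a method on Widget's singleton class, i.e. a class-level method
Widget.meta_def(:greeting) { "hello" }
Widget.greeting  # => "hello"

This is the mechanism Scraper#start_url (next file) uses to give each scraper class a class-level url reader.
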

data/lib/FindArt/scraper.rb
ADDED
@@ -0,0 +1,76 @@

# Factory that contains site specific scapers.
# Scrapers are classes that collect urls of album art for a given artist and title.
# Scraper classes can self register them self by calling register_scraper in the class body.
# == Example
#
#   @scraper = Scraper.new
#
#   class TestScraper < Scraper
#     register_scraper :test_scraper
#   end
#
#   @scraper.scrapers
#   {:test_scraper=>TestScraper}
module FindArt
  class Scraper
    @@scrapers = {}


    # class method for registering scrapers
    def self.register_scraper(name)
      @@scrapers[name] = self
    end

    # hash of all registerd scrapers
    def scrapers
      @@scrapers
    end

    def self.registerd_sites
      @@scrapers.map {|scraper_name,klass| URI(klass.url).host}
    end

    # clears all registerd scapers
    def self.unregister_scrapers!
      @@scrapers = {}
    end

    # all registerd scrapers are used to find album art
    # returns an array of urls
    def find_art(artist,title,opts=nil)
      threads = []
      @@scrapers.each do |scraper_name,klass|
        scraper = klass.new
        threads << Thread.new { Thread.current["results"] = scraper.scrape(artist,title,opts) }
      end
      results = []
      begin
        threads.each {|t| t.join; results << t["results"] unless t['results'].nil? }
      rescue
      end
      results
    end

    private

    # helper method for setting the url where the scraper starts its search for album art
    #
    # == Example
    #
    #   class JunoDownload < Scraper
    #     start_url "http://www.junodownload.com/search/"
    #     register_scraper :junodownload
    #   end
    #
    def self.start_url(url)
      class_variable_set "@@url", url
      meta_def :url do
        class_variable_get "@@url"
      end
    end
  end
end

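A minimal usage sketch of the factory above (illustrative only, assuming the gem is on the load path; DummyScraper and its URLs are made up, and unregister_scrapers! is called first so the bundled scrapers are not hit over the network):

require 'rubygems'
require 'FindArt'
require 'cgi'

FindArt::Scraper.unregister_scrapers!

class DummyScraper < FindArt::Scraper
  start_url "http://example.com/search?q="
  register_scraper :dummy

  # a scraper returns a single artwork url (or nil)
  def scrape(artist, title, opts = {})
    "http://example.com/#{CGI.escape("#{artist} #{title}")}.jpg"
  end
end

# find_art runs each registered scraper in its own thread and
# collects the non-nil results into an array
FindArt::Scraper.new.find_art("Daft Punk", "Discovery")
# => ["http://example.com/Daft+Punk+Discovery.jpg"]
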
data/lib/FindArt/scrapers/albumartexchange.rb
ADDED
@@ -0,0 +1,22 @@
module FindArt
  class AlbumArtExchange < Scraper
    start_url "http://www.albumartexchange.com/covers.php?sort=7&q="
    register_scraper :albumartexchange

    def scrape(artist,title,opts={})
      url = nil
      search_url = "#{@@url}#{CGI.escape("#{artist} #{title}")}"
      browser = WWW::Mechanize.new
      browser.get(search_url) do |page|
        doc = Hpricot(page.body)
        element = doc.at("* table tr td a img[@width='150']")
        src = element["src"] if !element.nil? && !element["src"].nil?
        if src
          path = src.split("src=").last
          url = "http://www.albumartexchange.com#{URI.decode(path)}"
        end
      end
      url
    end
  end
end

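The url rebuilding in scrape works on the thumbnail's src attribute. A standalone illustration with a hypothetical src value (the real gallery markup may differ, and URI.decode reflects the Ruby of the gem's era; it was removed in later Rubies):

require 'uri'

# hypothetical thumbnail src as it might appear in the search results page
src  = "/thumbs.php?src=%2Fgallery%2Fimages%2Fpublic%2Fti%2Ftiesto-insear_02.jpg"

path = src.split("src=").last
url  = "http://www.albumartexchange.com#{URI.decode(path)}"
# => "http://www.albumartexchange.com/gallery/images/public/ti/tiesto-insear_02.jpg"
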
data/lib/FindArt/scrapers/amazon.rb.disabled
ADDED
@@ -0,0 +1,47 @@
module FindArt
  class Amazon < Scraper
    start_url "http://webservices.amazon.com/"
    register_scraper :amazon
    @@urls = [
      "http://webservices.amazon.com/",
      "http://webservices.amazon.co.uk/",
      "http://webservices.amazon.co.jp/",
      "http://webservices.amazon.ca/",
      "http://webservices.amazon.fr/",
      "http://webservices.amazon.de/",
    ]


    # Uses the artist webservice to search for album covers.
    # Tries all international amazon sites
    def scrape(artist,title,opts={})
      search_url = "onca/xml?Service=AWSECommerceService&AWSAccessKeyId=0NK019CD48HNEDK3PBG2&Operation=ItemSearch&SearchIndex=Music&ResponseGroup=Small,Images&Keywords=#{CGI.escape("#{artist} #{title}")}"
      @@urls.each do |url|
        art = find_best_art("#{url}#{search_url}")
        return art unless art.nil?
      end
      nil
    end

    private

    # Finds the best quality artwork availible at amazon
    def find_best_art(search_url)
      browser = WWW::Mechanize.new
      search = browser.get(search_url)
      doc = Hpricot(search.body)

      large,medium,small = [],[],[]
      (doc/:item).each do |item|
        large << item.at("largeimage").at("url").innerHTML if item.at("largeimage")
        medium << item.at("mediumimage").at("url").innerHTML if item.at("mediumimage")
        small << item.at("smallimage").at("url").innerHTML if item.at("smallimage")
      end

      url = small.first unless small.empty?
      url = medium.first unless medium.empty?
      url = large.first unless large.empty?
    end

  end
end

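Note that the assignment order at the end of find_best_art means the largest available image wins, since the last non-empty list overwrites url. A standalone illustration with made-up urls:

small  = ["http://example.com/small.jpg"]
medium = ["http://example.com/medium.jpg"]
large  = ["http://example.com/large.jpg"]

url = small.first  unless small.empty?
url = medium.first unless medium.empty?
url = large.first  unless large.empty?   # last assignment wins when present
url  # => "http://example.com/large.jpg"
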
data/lib/FindArt/scrapers/discogs.rb
ADDED
@@ -0,0 +1,44 @@
# module FindArt
# #
# # class Discogs < Scraper
# # start_url "http://www.discogs.com/search?type=all&q="
# # register_scraper :discogs
# #
# # def scrape(artist,title,opts={})
# # url = nil
# # search_url = "#{@@url}#{CGI.escape("#{artist} #{title}")}"
# # browser = WWW::Mechanize.new
# # browser.get(search_url) do |page|
# # doc = Hpricot(page.body)
# #
# # # check if there are multiple results and get the top result
# # element = doc.at("* .search_result a")
# # if !element.nil?
# # # extract and fetch item page
# # item_page = browser.get(element.attributes["href"])
# # doc = Hpricot(item_page.body)
# #
# #
# #
# # end
# #
# # #extract art from product page
# # end
# # url
# # end
# #
# # # Extracts album art url from wallmart product page
# # def extract_art(doc)
# # puts doc
# # # url = nil
# # # element = doc.at("* td[@align='center'] a[href~='/viewimages']")
# # # href = element["href"] if !element.nil? && !element["href"].nil?
# # # if href
# # # match, url = *href.match(/javascript:photo_opener\('(http:\/\/.*.jpg)&/)
# # # end
# # # url
# # end
# #
# # end
#
# end

data/lib/FindArt/scrapers/junodownload.rb
ADDED
@@ -0,0 +1,31 @@
module FindArt
  class JunoDownload < Scraper
    start_url "http://www.junodownload.com/search/"
    register_scraper :junodownload

    def scrape(artist,title,opts={})
      url = nil

      browser = WWW::Mechanize.new
      browser.get(@@url) do |page|
        # go to search form
        search = page.form_with(:action => 'http://www.junodownload.com/search/') do |f|
          f.q = "#{artist} #{title}"
        end.click_button
        doc = Hpricot(search.body)
        element = doc.at('.productcover img[@src^="http://cdn.images.juno.co.uk/75/"]')
        src = nil
        if !element.nil?
          src = element.attributes['src']
          # instead of cliking on the image to find the uri of the larger image we will replace some parts of the url
          # lets hope junodownload keeps this convention
          src.gsub!("/75/","/full/")
          src.gsub!("-TN","-BIG")
          url = src
        end
      end
      url
    end

  end
end

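The thumbnail-to-full-size rewrite relies on Juno Download's url naming convention assumed by the scraper above. A standalone illustration with a hypothetical file name:

# hypothetical 75px thumbnail url matching the selector above
src = "http://cdn.images.juno.co.uk/75/CS123456-02-TN.jpg"

src.gsub!("/75/", "/full/")
src.gsub!("-TN", "-BIG")
src  # => "http://cdn.images.juno.co.uk/full/CS123456-02-BIG.jpg"
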
data/lib/FindArt/scrapers/walmart.rb
ADDED
@@ -0,0 +1,38 @@
module FindArt
  class WalMart < Scraper
    start_url "http://www.walmart.com/search/search-ng.do?earch_sort=2&search_query="
    register_scraper :walmart

    def scrape(artist,title,opts={})
      url = nil
      search_url = "#{@@url}#{CGI.escape("#{artist} #{title}")}"
      browser = WWW::Mechanize.new
      browser.get(search_url) do |page|
        doc = Hpricot(page.body)

        # check if there are multiple results and get the top result
        element = doc.at("* .firstRow a")
        if !element.nil?
          # extract and fetch item page
          item_page = browser.get(element.attributes["href"])
          doc = Hpricot(item_page.body)
        end

        #extract art from product page
        url = extract_art(doc)
      end
      url
    end

    # Extracts album art url from wallmart product page
    def extract_art(doc)
      url = nil
      element = doc.at("* div[@class='LargeItemPhoto150'] a[@href^=javascript]")
      href = element["href"] if !element.nil? && !element["href"].nil?
      if href
        match, url = *href.match(/javascript:photo_opener\('(http:\/\/.*.jpg)&/)
      end
      url
    end
  end
end

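The extract_art regex above pulls the full-size jpg out of a javascript: photo_opener handler. A standalone illustration with a made-up href (the real product-page markup may differ):

# hypothetical href from a product page's photo link
href = "javascript:photo_opener('http://images.example.com/cover_500X500.jpg&width=500')"

match, url = *href.match(/javascript:photo_opener\('(http:\/\/.*.jpg)&/)
url  # => "http://images.example.com/cover_500X500.jpg"
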
data/script/console
ADDED
@@ -0,0 +1,10 @@
#!/usr/bin/env ruby
# File: script/console
irb = RUBY_PLATFORM =~ /(:?mswin|mingw)/ ? 'irb.bat' : 'irb'

libs = " -r irb/completion"
# Perhaps use a console_lib to store any extra methods I may want available in the cosole
# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
libs << " -r #{File.dirname(__FILE__) + '/../lib/FindArt.rb'}"
puts "Loading FindArt gem"
exec "#{irb} #{libs} --simple-prompt"

data/script/destroy
ADDED
@@ -0,0 +1,14 @@
#!/usr/bin/env ruby
APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))

begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

ARGV.shift if ['--help', '-h'].include?(ARGV[0])
RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Scripts::Destroy.new.run(ARGV)

data/script/generate
ADDED
@@ -0,0 +1,14 @@
#!/usr/bin/env ruby
APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))

begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

ARGV.shift if ['--help', '-h'].include?(ARGV[0])
RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Scripts::Generate.new.run(ARGV)

data/spec/albumartexchange_spec.rb
ADDED
@@ -0,0 +1,26 @@
require File.dirname(__FILE__) + '/spec_helper.rb'

describe AlbumArtExchange do

  it "Should have a start url form where the scaper starts" do
    AlbumArtExchange.class_variables.should include("@@url")
    AlbumArtExchange.url.should == 'http://www.albumartexchange.com/covers.php?sort=7&q='
  end

  it "Should find the album art url for DJ TIESTO - In Search Of Sunrise 6" do
    @album = AlbumArtExchange.new()
    @album.scrape("DJ TIESTO","In Search of Sunrise 6").should == "http://www.albumartexchange.com/gallery/images/public/ti/tiesto-insear_02.jpg"
  end

  it "Should return nil when no album art is found" do
    @album = AlbumArtExchange.new()
    @album.scrape("Irene Moors & De Smurfen","Ga Je Mee Naar Smurfenland").should be(nil)
  end

  it "Should find the album art for the following albums" do
    @album = AlbumArtExchange.new()
    @album.scrape("Hi_Tack","Let's Dance").should == "http://www.albumartexchange.com/gallery/images/public/hi/hi_tac-letsda.jpg"
    @album.scrape("24 Grana","underpop").should == "http://www.albumartexchange.com/gallery/images/public/24/24gran-underp_02.jpg"
    @album.scrape("Green Day","Dookie").should == "http://www.albumartexchange.com/gallery/images/public/gr/greend-dookie_05.jpg"
  end
end

data/spec/amazon_spec.rb
ADDED
@@ -0,0 +1,34 @@
# require File.dirname(__FILE__) + '/spec_helper.rb'
#
# describe Amazon do
#
# it "Should have a start url form where the scaper starts" do
# Amazon.class_variables.should include("@@url")
# Amazon.url.should == 'http://webservices.amazon.com/'
# end
#
# it "Should find the album art url for DJ TIESTO - In Search Of Sunrise 6" do
# @amazon = Amazon.new()
# @amazon.scrape("DJ TIESTO","In Search of Sunrise 6").should == "http://ecx.images-amazon.com/images/I/515ZxHVU5RL.jpg"
# end
#
# it "Should find the album art url for Caféine de Christophe Willem " do
# @amazon = Amazon.new()
# @amazon.scrape("Christophe Willem","Caféine").should == "http://ecx.images-amazon.com/images/I/61sOk67nVkL.jpg"
# end
#
# it "Should find the album art url for Miss Météores de Olivia Ruiz" do
# @amazon = Amazon.new()
# @amazon.scrape("Olivia Ruiz","Miss Météores").should == "http://ecx.images-amazon.com/images/I/51S7recnyQL.jpg"
# end
#
# it "Should find the album art url for 初音ミク ベスト impacts" do
# @amazon = Amazon.new()
# @amazon.scrape(" 初音ミク ベスト","impacts").should == "http://ecx.images-amazon.com/images/I/51P-6djocGL.jpg"
# end
#
# it "Should return nil if no album art is found " do
# @amazon = Amazon.new()
# @amazon.scrape("internets","i did it for teh lulz").should == nil
# end
# end