rbook 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. data/COPYING +340 -0
  2. data/LICENSE +13 -0
  3. data/README +16 -0
  4. data/Rakefile +206 -0
  5. data/examples/titlepage.rb +14 -0
  6. data/examples/www/find_all.rb +23 -0
  7. data/examples/www/find_cover_from_amazon.rb +12 -0
  8. data/examples/www/find_url_from_rainbow.rb +12 -0
  9. data/examples/www/list.rb +13 -0
  10. data/lib/rbook/bisac.rb +175 -0
  11. data/lib/rbook/errors.rb +7 -0
  12. data/lib/rbook/isbn.rb +249 -0
  13. data/lib/rbook/onix.rb +68 -0
  14. data/lib/rbook/onix/contributor.rb +60 -0
  15. data/lib/rbook/onix/lists.rb +2 -0
  16. data/lib/rbook/onix/lists/contributor_role.rb +10 -0
  17. data/lib/rbook/onix/lists/product_form.rb +100 -0
  18. data/lib/rbook/onix/message.rb +101 -0
  19. data/lib/rbook/onix/product.rb +188 -0
  20. data/lib/rbook/onix/sales_restriction.rb +51 -0
  21. data/lib/rbook/onix/supply_detail.rb +68 -0
  22. data/lib/rbook/onix/xchar.rb +98 -0
  23. data/lib/rbook/titlepage.rb +96 -0
  24. data/lib/rbook/titlepage/TitleQueryClient.rb +62 -0
  25. data/lib/rbook/titlepage/titlepage_driver.rb +134 -0
  26. data/lib/rbook/titlepage/titlepage_utils.rb +374 -0
  27. data/lib/rbook/www.rb +172 -0
  28. data/lib/rbook/www/aau_scraper.rb +76 -0
  29. data/lib/rbook/www/amazon_uk_scraper.rb +44 -0
  30. data/lib/rbook/www/base.rb +87 -0
  31. data/lib/rbook/www/harper_au_scraper.rb +56 -0
  32. data/lib/rbook/www/harper_us_scraper.rb +55 -0
  33. data/lib/rbook/www/hha_scraper.rb +50 -0
  34. data/lib/rbook/www/macmillan_scraper.rb +62 -0
  35. data/lib/rbook/www/orbis_scraper.rb +48 -0
  36. data/lib/rbook/www/oup_scraper.rb +64 -0
  37. data/lib/rbook/www/paulist_scraper.rb +53 -0
  38. data/lib/rbook/www/pearson_au_scraper.rb +52 -0
  39. data/lib/rbook/www/penguin_scraper.rb +45 -0
  40. data/lib/rbook/www/random_au_scraper.rb +90 -0
  41. data/lib/rbook/www/random_us_scraper.rb +59 -0
  42. data/lib/rbook/www/sas_scraper.rb +54 -0
  43. data/lib/rbook/www/unireps_scraper.rb +58 -0
  44. data/lib/rbook/www/wiley_us_scraper.rb +54 -0
  45. data/test/data/abingdon.xml +38931 -0
  46. data/test/data/augsburg.xml +39009 -0
  47. data/test/data/chalice.xml +10851 -0
  48. data/test/data/eerdsman.xml +36942 -0
  49. data/test/data/invalid_no_product.xml +9 -0
  50. data/test/data/not_xml.csv +1 -0
  51. data/test/data/single_product.xml +50 -0
  52. data/test/data/xml_not_onix.xml +7 -0
  53. data/test/mocks/titlepage_driver.rb +107 -0
  54. data/test/unit/bisac_test.rb +57 -0
  55. data/test/unit/isbn_test.rb +149 -0
  56. data/test/unit/onix/contributor_test.rb +50 -0
  57. data/test/unit/onix/message_test.rb +119 -0
  58. data/test/unit/onix/product_test.rb +101 -0
  59. data/test/unit/onix/sales_restriction_test.rb +48 -0
  60. data/test/unit/onix/supply_detail_test.rb +53 -0
  61. data/test/unit/onix/xchar_test.rb +37 -0
  62. data/test/unit/titlepage_test.rb +127 -0
  63. metadata +130 -0
@@ -0,0 +1,76 @@
1
+
2
module RBook
  module WWW

    # Scraper for the Allen and Unwin bookseller site. Fetches the product
    # page for an ISBN and extracts bibliographic details and cover links.
    class AAUScraper < Base

      SCRAPER_ID   = :aau
      SCRAPER_NAME = "Allen and Unwin".freeze
      SCRAPER_SITE = "http://www.allenandunwin.com/".freeze

      #add_publisher( self, "9781741100000", "9781741199999" )
      #add_publisher( self, "9781865000000", "9781865099999" )
      add_scraper( self )

      # Sets up the URL components used to build product page links.
      def initialize
        @url_protocol = "http://"
        @url_host     = "www.allenandunwin.com"
        @url_path     = "/bookseller/product.aspx?ISBN="
      end

      # Scrapes the product page for the requested isbn.
      #
      # isbn - an ISBN10 or ISBN13 string
      #
      # Returns a hash of the details found, or nil when the page carries no
      # title. Raises ArgumentError when the supplied isbn is not valid.
      def get_info(isbn)
        raise ArgumentError, 'Supplied isbn is not valid' unless ISBN::valid_isbn?(isbn)

        isbn = ISBN::convert_to_isbn13(isbn) unless ISBN::valid_isbn13?(isbn)

        rba = Scraper.define do
          process "span#lblISBN",         :isbn        => :text
          process "h1>span#lblBookTitle", :title       => :text
          process "span#lblAusRRP",       :rrp         => :text
          process "span#lblPublisher",    :publisher   => :text
          process "span#lblImprint",      :imprint     => :text
          process "span#lblBinding",      :form        => :text
          process "span#lblExtent",       :pages       => :text
          process "span#lblPubDate",      :pubdate     => :text
          process "span#lblDescription",  :description => :text
          process "span#lblAuthor_bio",   :authorbio   => :text
          process "a#hypHiRes",           :cover_large => "@href"
          process "a#imgProduct",         :cover_thumb => "@href"
          result :isbn, :title, :rrp, :publisher, :imprint, :form, :pages, :pubdate, :description, :authorbio, :cover_thumb, :cover_large
        end

        link    = get_link(isbn)
        content = Net::HTTP.get URI.parse(link)
        result  = rba.scrape(content)

        # a page without a title is treated as "product not found"
        return nil if result.title.nil? || result.title == ""

        info = {}
        # fall back to the requested isbn when the page does not display one
        info[:isbn]        = result.isbn.nil? ? isbn : result.isbn.gsub("ISBN : ", "")
        info[:title]       = result.title
        info[:rrp]         = result.rrp.gsub("Australian Price : ", "").gsub(/\sInc. GST\n.+/,"") unless result.rrp.nil?
        # guard on the publisher itself (the original guarded on the imprint)
        info[:publisher]   = result.publisher.gsub("Publisher : ", "") unless result.publisher.nil?
        info[:imprint]     = result.imprint.gsub("Imprint : ", "") unless result.imprint.nil?
        info[:format]      = result.form.gsub("Format : ", "") unless result.form.nil?
        info[:pages]       = result.pages.gsub("Number of pages : ", "") unless result.pages.nil?
        info[:pubdate]     = result.pubdate.gsub("Publication Date : ", "") unless result.pubdate.nil?
        info[:description] = result.description unless result.description.nil?
        info[:authorbio]   = result.authorbio.gsub("About the Author :\n", "") unless result.authorbio.nil?
        # the large cover href has its first two characters stripped before
        # being appended to the host
        info[:cover_large] = @url_protocol + @url_host + result.cover_large.gsub(/^../, "") unless result.cover_large.nil?
        info[:cover_thumb] = @url_protocol + @url_host + result.cover_thumb unless result.cover_thumb.nil?
        info[:link]        = link
        info[:from_name]   = SCRAPER_NAME
        info[:from_url]    = SCRAPER_SITE
        info
      end

      # Builds the product page URL for the requested isbn (the site is
      # queried by ISBN10). Returns nil when the isbn is not valid.
      def get_link(isbn)
        return nil unless ISBN::valid_isbn?(isbn)
        @url_protocol + @url_host + @url_path + ISBN::convert_to_isbn10(isbn)
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+
2
module RBook
  module WWW

    # Scraper that retrieves cover images for an ISBN from Amazon's image
    # server. Only get_cover is implemented by this class.
    class AmazonUKScraper < Base

      SCRAPER_ID   = :amazon_uk
      SCRAPER_NAME = "Amazon UK".freeze
      SCRAPER_SITE = "http://www.amazon.co.uk/".freeze

      #add_retailer( self )
      add_scraper( self )

      # Downloads the cover image for the requested isbn.
      #
      # Returns a hash with :data (the response body) and :content_type, or
      # nil when the isbn cannot be converted, the request fails, or the
      # response is too small to be a usable image.
      def get_cover(isbn)
        isbn13 = ISBN::convert_to_isbn13(isbn)
        isbn10 = ISBN::convert_to_isbn10(isbn13)

        return nil if isbn13.nil? || isbn10.nil?

        # images are addressed by ISBN10
        image_url = "http://images.amazon.com/images/P/" + isbn10 + ".02.LZZZZZZZ.jpg"

        begin
          response = Net::HTTP.get_response URI.parse(image_url)
          return nil unless response.code == "200"
          # NOTE(review): 807 bytes is presumably the size of the placeholder
          # returned when no cover exists - confirm
          return nil if response.body.size <= 807

          { :data => response.body, :content_type => "image/jpeg" }
        rescue
          nil
        end
      end

    end
  end
end
@@ -0,0 +1,87 @@
1
+ require 'net/http'
2
+ require 'uri'
3
+
4
module RBook
  module WWW

    # Common superclass for all scrapers. Keeps a registry of scraper classes
    # and supplies default implementations of the scraper interface.
    class Base

      # registry shared by Base and every subclass
      @@scrapers = []

      # registers a new scraper with the library.
      # classname - the class to add
      def self.add_scraper(classname)
        @@scrapers << classname
      end

      # find a scraper matching the requested id
      # id - a scraper id as a symbol
      # returns the scraper class, or nil if none is registered under that id
      def self.find_scraper(id)
        @@scrapers.find { |scraper| scraper::SCRAPER_ID == id }
      end

      # find any scrapers matching the requested ids
      # ids - an array of scraper id's as symbols (a single symbol is also accepted)
      # returns an array of scraper classes, in registration order
      def self.find_scrapers(ids)
        wanted = Array(ids)
        # Array has no #contains? method; #include? is the membership test
        @@scrapers.select { |scraper| wanted.include?(scraper::SCRAPER_ID) }
      end

      # This method can be overwritten in each scraper. It should return a hash containing the binary data
      # and mimetype of the largest cover image it can find for the requested isbn
      #
      # The default implementation downloads the largest cover link reported by
      # get_info. Returns nil if no link is available or the download fails.
      def get_cover(isbn)
        info = get_info(isbn)
        return nil unless info.kind_of?(Hash)

        link = info[:cover_large] || info[:cover_medium] || info[:cover_thumb]
        return nil if link.nil?

        begin
          response = Net::HTTP.get_response URI.parse(link)
          return nil unless response.code == "200"

          { :data => response.body, :content_type => "image/jpeg" }
        rescue StandardError
          # best effort: any network or parsing failure means "no cover"
          nil
        end
      end

      # This method can be overwritten in each scraper. It should return a hash of any information on
      # the requested isbn it can find
      def get_info(isbn)
        nil
      end

      # This method can be overwritten in each scraper. It should return a link to the requested isbn
      # on the targets website
      def get_link(isbn)
        nil
      end

      # return the symbol used to uniquely identify each scraper
      def scraper_id
        # SCRAPER_ID is defined by each subclass, so it has to be looked up
        # through the receiver's class rather than lexically inside Base
        self.class::SCRAPER_ID
      end

      # returns the array of all registered scraper classes
      def self.scrapers
        @@scrapers
      end
    end

  end
end
@@ -0,0 +1,56 @@
1
+
2
module RBook
  module WWW

    # Scraper for the Harper Collins Australia product catalogue.
    class HarperCollinsAUScraper < Base

      SCRAPER_ID   = :harper_au
      SCRAPER_NAME = "Harper Collins Australia".freeze
      SCRAPER_SITE = "http://www.harpercollins.com.au/".freeze

      #add_publisher( self, "9780006400000", "9780006499999" )
      #add_publisher( self, "9780007100000", "9780007199999" )
      add_scraper( self )

      # Scrapes the catalogue page for the requested isbn.
      #
      # isbn - an ISBN10 or ISBN13 string
      #
      # Returns a hash of the details found, or nil when the page carries no
      # title. Raises ArgumentError when the supplied isbn is not valid.
      def get_info(isbn)
        raise ArgumentError, 'Supplied isbn is not valid' unless ISBN::valid_isbn?(isbn)

        isbn = ISBN::convert_to_isbn13(isbn) unless ISBN::valid_isbn13?(isbn)

        # the catalogue is queried by ISBN10
        @protocol = "http://"
        @host     = "www.harpercollins.com.au"
        @path     = "/global_scripts/product_catalog/book_xml.asp?isbn="
        @link     = @protocol + @host + @path + ISBN::convert_to_isbn10(isbn)

        rba = Scraper.define do
          process "div.header",     :title    => :text
          process "div.subtitle",   :subtitle => :text
          process "div.byline",     :author   => :text
          process "img.bookJacket", :cover    => "@src"
          result :title, :subtitle, :author, :cover
        end

        content = Net::HTTP.get URI.parse(@link)
        result  = rba.scrape(content)

        return nil if result.title.nil?

        info = {}
        info[:isbn]      = isbn
        info[:title]     = result.title
        info[:subtitle]  = result.subtitle
        # a page may have a title but no byline; skip the author instead of
        # calling gsub on nil
        info[:author]    = result.author.gsub("by ","") unless result.author.nil?
        info[:cover]     = result.cover
        info[:link]      = @link
        info[:from_name] = SCRAPER_NAME
        info[:from_url]  = SCRAPER_SITE
        info
      end

    end
  end
end
@@ -0,0 +1,55 @@
1
+
2
module RBook
  module WWW

    # Scraper for the Harper Collins United States website.
    class HarperCollinsUSScraper < Base

      SCRAPER_ID   = :harper_us
      SCRAPER_NAME = "Harper Collins United States".freeze
      SCRAPER_SITE = "http://www.harpercollins.com/".freeze

      #add_publisher( self, "9780060000000", "9780060999999" )
      add_scraper( self )

      # Scrapes the book page for the requested isbn.
      #
      # isbn - an ISBN10 or ISBN13 string
      #
      # Returns a hash of the details found, or nil when the page carries no
      # title. Raises ArgumentError when the supplied isbn is not valid.
      def get_info(isbn)
        raise ArgumentError, 'Supplied isbn is not valid' unless ISBN::valid_isbn?(isbn)

        isbn = ISBN::convert_to_isbn13(isbn) unless ISBN::valid_isbn13?(isbn)

        # this site is queried by ISBN13
        @protocol = "http://"
        @host     = "www.harpercollins.com"
        @path     = "/book/index.aspx?isbn="
        @link     = @protocol + @host + @path + isbn

        rba = Scraper.define do
          process "h1.bookTitle",    :title    => :text
          process "h2.bookSubTitle", :subtitle => :text
          process "h3.byLine",       :author   => :text
          process "img.bookJacket",  :cover    => "@src"
          result :title, :subtitle, :author, :cover
        end

        content = Net::HTTP.get URI.parse(@link)
        result  = rba.scrape(content)

        return nil if result.title.nil?

        info = {}
        info[:isbn]         = isbn
        info[:title]        = result.title
        info[:subtitle]     = result.subtitle
        # byline and jacket image are optional on the page; skip the derived
        # values instead of calling gsub on nil
        info[:author]       = result.author.gsub("by ", "") unless result.author.nil?
        info[:cover_thumb]  = result.cover
        # the bigger image is derived by swapping "medium" for "large" in the
        # jacket image URL
        info[:cover_medium] = result.cover.gsub("medium", "large") unless result.cover.nil?
        info[:link]         = @link
        info[:from_name]    = SCRAPER_NAME
        info[:from_url]     = SCRAPER_SITE
        info
      end

    end
  end
end
@@ -0,0 +1,50 @@
1
+
2
module RBook
  module WWW

    # Scraper for the Hodder Headline Australia website.
    class HHAScraper < Base

      SCRAPER_ID   = :hha
      SCRAPER_NAME = "Hodder Headline Australia".freeze
      SCRAPER_SITE = "http://www.hha.com.au/".freeze

      #add_publisher( self, "9780340800000", "9780340899999" )
      #add_publisher( self, "9780755300000", "9780755399999" )
      #add_publisher( self, "9780733600000", "9780733699999" )
      add_scraper( self )

      # Scrapes the book page for the requested isbn.
      #
      # Returns a hash of the details found, or nil when the isbn cannot be
      # converted to an ISBN10 or the page carries no thumbnail image.
      def get_info(isbn)
        # pages are addressed by ISBN10
        # NOTE(review): presumably convert_to_isbn10 returns nil for input it
        # cannot convert - confirm against the ISBN module
        isbn10 = ISBN::convert_to_isbn10(isbn)
        return nil if isbn10.nil?

        @protocol = "http://"
        @host     = "www.hha.com.au"
        @path     = "/books/"
        @suffix   = ".html"
        @link     = @protocol + @host + @path + isbn10 + @suffix

        rba = Scraper.define do
          process "h1.fiction",  :title       => :text
          process "p.author",    :author      => :text
          process "p.thumb>img", :cover_thumb => "@src"
          result :title, :author, :cover_thumb
        end

        content = Net::HTTP.get URI.parse(@link)
        result  = rba.scrape(content)

        # a page without a thumbnail is treated as "product not found"
        return nil if result.cover_thumb.nil?

        info = {}
        info[:isbn]        = isbn
        # the title was scraped but never returned by the original code
        info[:title]       = result.title unless result.title.nil?
        info[:author]      = result.author
        info[:cover_thumb] = @protocol + @host + result.cover_thumb
        info[:link]        = @link
        info[:from_name]   = SCRAPER_NAME
        info[:from_url]    = SCRAPER_SITE
        info
      end

    end
  end
end
@@ -0,0 +1,62 @@
1
+
2
module RBook

  module WWW

    # Scraper for the Pan Macmillan Australia website.
    class MacmillanScraper < Base

      SCRAPER_ID   = :macmillan
      SCRAPER_NAME = "Pan Macmillan".freeze
      SCRAPER_SITE = "http://www.panmacmillan.com.au/".freeze

      #add_publisher( self, "9780312900000", "9780312999999" )
      #add_publisher( self, "9780330400000", "9780330499999" )
      #add_publisher( self, "9781403000000", "9781405099999" )
      add_scraper( self )

      # Sets up the URL components used to build title page links.
      def initialize
        @url_protocol = "http://"
        @url_host     = "www.panmacmillan.com.au"
        @url_path     = "/display_title.asp?ISBN="
        # NOTE(review): every link is built with this fixed Author parameter -
        # presumably the site needs one to be present; confirm
        @url_suffix   = "&Author=Barker,%20Robin"
      end

      # Scrapes the title page for the requested isbn.
      #
      # Returns a hash of the details found, or nil when the isbn cannot be
      # converted to an ISBN13 or the page carries no title.
      def get_info(isbn)
        isbn = ISBN::convert_to_isbn13(isbn)
        return nil if isbn.nil?

        page_scraper = Scraper.define do
          process "div.titlecontent>div.isbn>span", :isbn => :text
          process "td[width=70%]>h1", :title => :text
          process "a[title=Click on image to view a larger version]>img", :cover_medium => "@src"
          process "a[title=Click on image to view a larger version]", :cover_large => "@href"
          result :isbn, :title, :cover_medium, :cover_large
        end

        page_url = get_link(isbn)
        scraped  = page_scraper.scrape(Net::HTTP.get(URI.parse(page_url)))

        return nil if scraped.title.nil?

        # image paths have every ".." removed and are appended to the site host
        absolute = lambda { |path| @url_protocol + @url_host + path.gsub("..", "") }

        details = {}
        details[:isbn]         = isbn
        details[:title]        = scraped.title
        details[:cover_medium] = absolute.call(scraped.cover_medium) unless scraped.cover_medium.nil?
        details[:cover_large]  = absolute.call(scraped.cover_large) unless scraped.cover_large.nil?
        details[:link]         = page_url
        details[:from_name]    = SCRAPER_NAME
        details[:from_url]     = SCRAPER_SITE
        details
      end

      # Builds the title page URL for the requested isbn (the site is queried
      # by ISBN10). Returns nil when the isbn is not valid.
      def get_link(isbn)
        return nil unless ISBN::valid_isbn?(isbn)

        isbn10 = ISBN::convert_to_isbn10(isbn)
        @url_protocol + @url_host + @url_path + isbn10 + @url_suffix
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
+
2
module RBook
  module WWW

    # Scraper for Orbis Books. Product pages are fetched from the
    # maryknollmall.org host.
    class OrbisScraper < Base

      # frozen to match the constants declared by the other scrapers
      SCRAPER_ID   = :orbis
      SCRAPER_NAME = "Orbis Books".freeze
      SCRAPER_SITE = "http://www.orbisbooks.com/".freeze

      #add_publisher( self, "9781570700000", "9781570799999")
      add_scraper( self )

      # Scrapes the product page for the requested isbn.
      #
      # Returns a hash holding the thumbnail cover link and page link, or nil
      # when the page carries no cover image.
      def get_info(isbn)
        @protocol     = "http://"
        @host         = "www.maryknollmall.org"
        @path         = "/description.cfm?ISBN="
        # the page is looked up using the ISBN10 passed through ISBN::add_groups
        @grouped_isbn = ISBN::add_groups(ISBN::convert_to_isbn10(isbn))
        @link         = @protocol + @host + @path + @grouped_isbn

        orbis = Scraper.define do
          process "tr>td[colspan=4]>font[size=3]", :description => :text # doesn't currently work
          process "table>tr>td[rowspan=2]>img", :cover_thumb => "@src"
          result :description, :cover_thumb
        end

        content = Net::HTTP.get URI.parse(@link)
        result  = orbis.scrape(content)

        # a page without a cover image is treated as "product not found"
        return nil if result.cover_thumb.nil?

        info = {}
        info[:isbn]        = isbn
        info[:cover_thumb] = @protocol + @host + result.cover_thumb
        info[:link]        = @link
        info[:from_name]   = SCRAPER_NAME
        info[:from_url]    = SCRAPER_SITE
        info
      end

    end
  end
end