rbook 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. data/COPYING +340 -0
  2. data/LICENSE +13 -0
  3. data/README +16 -0
  4. data/Rakefile +206 -0
  5. data/examples/titlepage.rb +14 -0
  6. data/examples/www/find_all.rb +23 -0
  7. data/examples/www/find_cover_from_amazon.rb +12 -0
  8. data/examples/www/find_url_from_rainbow.rb +12 -0
  9. data/examples/www/list.rb +13 -0
  10. data/lib/rbook/bisac.rb +175 -0
  11. data/lib/rbook/errors.rb +7 -0
  12. data/lib/rbook/isbn.rb +249 -0
  13. data/lib/rbook/onix.rb +68 -0
  14. data/lib/rbook/onix/contributor.rb +60 -0
  15. data/lib/rbook/onix/lists.rb +2 -0
  16. data/lib/rbook/onix/lists/contributor_role.rb +10 -0
  17. data/lib/rbook/onix/lists/product_form.rb +100 -0
  18. data/lib/rbook/onix/message.rb +101 -0
  19. data/lib/rbook/onix/product.rb +188 -0
  20. data/lib/rbook/onix/sales_restriction.rb +51 -0
  21. data/lib/rbook/onix/supply_detail.rb +68 -0
  22. data/lib/rbook/onix/xchar.rb +98 -0
  23. data/lib/rbook/titlepage.rb +96 -0
  24. data/lib/rbook/titlepage/TitleQueryClient.rb +62 -0
  25. data/lib/rbook/titlepage/titlepage_driver.rb +134 -0
  26. data/lib/rbook/titlepage/titlepage_utils.rb +374 -0
  27. data/lib/rbook/www.rb +172 -0
  28. data/lib/rbook/www/aau_scraper.rb +76 -0
  29. data/lib/rbook/www/amazon_uk_scraper.rb +44 -0
  30. data/lib/rbook/www/base.rb +87 -0
  31. data/lib/rbook/www/harper_au_scraper.rb +56 -0
  32. data/lib/rbook/www/harper_us_scraper.rb +55 -0
  33. data/lib/rbook/www/hha_scraper.rb +50 -0
  34. data/lib/rbook/www/macmillan_scraper.rb +62 -0
  35. data/lib/rbook/www/orbis_scraper.rb +48 -0
  36. data/lib/rbook/www/oup_scraper.rb +64 -0
  37. data/lib/rbook/www/paulist_scraper.rb +53 -0
  38. data/lib/rbook/www/pearson_au_scraper.rb +52 -0
  39. data/lib/rbook/www/penguin_scraper.rb +45 -0
  40. data/lib/rbook/www/random_au_scraper.rb +90 -0
  41. data/lib/rbook/www/random_us_scraper.rb +59 -0
  42. data/lib/rbook/www/sas_scraper.rb +54 -0
  43. data/lib/rbook/www/unireps_scraper.rb +58 -0
  44. data/lib/rbook/www/wiley_us_scraper.rb +54 -0
  45. data/test/data/abingdon.xml +38931 -0
  46. data/test/data/augsburg.xml +39009 -0
  47. data/test/data/chalice.xml +10851 -0
  48. data/test/data/eerdsman.xml +36942 -0
  49. data/test/data/invalid_no_product.xml +9 -0
  50. data/test/data/not_xml.csv +1 -0
  51. data/test/data/single_product.xml +50 -0
  52. data/test/data/xml_not_onix.xml +7 -0
  53. data/test/mocks/titlepage_driver.rb +107 -0
  54. data/test/unit/bisac_test.rb +57 -0
  55. data/test/unit/isbn_test.rb +149 -0
  56. data/test/unit/onix/contributor_test.rb +50 -0
  57. data/test/unit/onix/message_test.rb +119 -0
  58. data/test/unit/onix/product_test.rb +101 -0
  59. data/test/unit/onix/sales_restriction_test.rb +48 -0
  60. data/test/unit/onix/supply_detail_test.rb +53 -0
  61. data/test/unit/onix/xchar_test.rb +37 -0
  62. data/test/unit/titlepage_test.rb +127 -0
  63. metadata +130 -0
@@ -0,0 +1,76 @@
1
+
2
module RBook
  module WWW

    # Scraper for the Allen and Unwin bookseller product pages.
    #
    # Registers itself with Base via add_scraper when the class is loaded.
    class AAUScraper < Base

      SCRAPER_ID = :aau
      SCRAPER_NAME = "Allen and Unwin".freeze
      SCRAPER_SITE = "http://www.allenandunwin.com/".freeze

      #add_publisher( self, "9781741100000", "9781741199999" )
      #add_publisher( self, "9781865000000", "9781865099999" )
      add_scraper( self )

      # Stores the URL pieces that get_link and get_info combine into
      # product and cover image addresses.
      def initialize
        @url_protocol = "http://"
        @url_host = "www.allenandunwin.com"
        @url_path = "/bookseller/product.aspx?ISBN="
      end

      # Fetches the product page for the supplied isbn and returns a hash of
      # the details found on it, or nil when the page has no (or an empty) title.
      #
      # isbn - an ISBN accepted by ISBN::valid_isbn?
      #
      # Raises ArgumentError when the isbn is not valid.
      def get_info(isbn)
        raise ArgumentError, 'Supplied isbn is not valid' unless ISBN::valid_isbn?(isbn)

        isbn = ISBN::convert_to_isbn13(isbn) unless ISBN::valid_isbn13?(isbn)

        # NOTE(review): Scraper is defined outside this file - presumably an
        # HTML scraping library; each rule maps a CSS selector to a result field.
        rba = Scraper.define do
          process "span#lblISBN", :isbn => :text
          process "h1>span#lblBookTitle", :title => :text
          process "span#lblAusRRP", :rrp => :text
          process "span#lblPublisher", :publisher => :text
          process "span#lblImprint", :imprint => :text
          process "span#lblBinding", :form => :text
          process "span#lblExtent", :pages => :text
          process "span#lblPubDate", :pubdate => :text
          process "span#lblDescription", :description => :text
          process "span#lblAuthor_bio", :authorbio => :text
          process "a#hypHiRes", :cover_large => "@href"
          process "a#imgProduct", :cover_thumb => "@href"
          result :isbn, :title, :rrp, :publisher, :imprint, :form, :pages, :pubdate, :description, :authorbio, :cover_thumb, :cover_large
        end

        link = get_link(isbn)
        content = Net::HTTP.get URI.parse(link)
        result = rba.scrape(content)

        # a page without a title is treated as "product not found"
        return nil if result.title.nil? || result.title == ""

        info = {}
        # fall back to the requested isbn when the page carries no ISBN label,
        # rather than failing on nil
        info[:isbn] = result.isbn.nil? ? isbn : result.isbn.gsub("ISBN : ", "")
        info[:title] = result.title
        info[:rrp] = result.rrp.gsub("Australian Price : ", "").gsub(/\sInc. GST\n.+/,"") unless result.rrp.nil?
        # guard on the publisher field itself (previously guarded on imprint)
        info[:publisher] = result.publisher.gsub("Publisher : ", "") unless result.publisher.nil?
        info[:imprint] = result.imprint.gsub("Imprint : ", "") unless result.imprint.nil?
        info[:format] = result.form.gsub("Format : ", "") unless result.form.nil?
        info[:pages] = result.pages.gsub("Number of pages : ", "") unless result.pages.nil?
        info[:pubdate] = result.pubdate.gsub("Publication Date : ", "") unless result.pubdate.nil?
        info[:description] = result.description unless result.description.nil?
        info[:authorbio] = result.authorbio.gsub("About the Author :\n", "") unless result.authorbio.nil?
        # the first two characters of the large cover href are dropped before
        # it is appended to the host
        info[:cover_large] = @url_protocol + @url_host + result.cover_large.gsub(/^../, "") unless result.cover_large.nil?
        info[:cover_thumb] = @url_protocol + @url_host + result.cover_thumb unless result.cover_thumb.nil?
        info[:link] = link
        info[:from_name] = SCRAPER_NAME
        info[:from_url] = SCRAPER_SITE
        info
      end

      # Returns the product page URL for the supplied isbn (built from its
      # ISBN10 form), or nil when the isbn is not valid.
      def get_link(isbn)
        return nil unless ISBN::valid_isbn?(isbn)
        @url_protocol + @url_host + @url_path + ISBN::convert_to_isbn10(isbn)
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+
2
module RBook
  module WWW

    # Cover image lookup against the Amazon image server.
    #
    # Registers itself with Base via add_scraper when the class is loaded.
    class AmazonUKScraper < Base

      SCRAPER_ID = :amazon_uk
      SCRAPER_NAME = "Amazon UK".freeze
      SCRAPER_SITE = "http://www.amazon.co.uk/".freeze

      #add_retailer( self )
      add_scraper( self )

      # Downloads the cover image for the supplied isbn.
      #
      # Returns a hash with :data (the response body) and :content_type, or
      # nil when the isbn cannot be converted, the response is not a 200, the
      # body is too small, or any StandardError is raised while fetching.
      def get_cover(isbn)
        isbn = ISBN::convert_to_isbn13(isbn)
        isbn10 = ISBN::convert_to_isbn10(isbn)

        return nil if isbn.nil? || isbn10.nil?

        link = "http://images.amazon.com/images/P/" + isbn10 + ".02.LZZZZZZZ.jpg"

        begin
          response = Net::HTTP.get_response URI.parse(link)
          return nil unless response.code == "200"
          # NOTE(review): bodies of 807 bytes or fewer are rejected - presumably
          # the size of a "no image" placeholder; confirm
          return nil if response.body.size <= 807

          { :data => response.body, :content_type => "image/jpeg" }
        rescue
          nil
        end
      end

    end
  end
end
@@ -0,0 +1,87 @@
1
+ require 'net/http'
2
+ require 'uri'
3
+
4
module RBook
  module WWW

    # Common superclass for all scrapers. Keeps the registry of scraper
    # classes and supplies default get_cover / get_info / get_link behaviour
    # that individual scrapers override.
    class Base

      # shared by Base and every subclass, so a subclass calling add_scraper
      # adds to the same list
      @@scrapers = []

      # registers a new scraper with the library.
      # classname - the class to add
      def self.add_scraper(classname)
        @@scrapers << classname
      end

      # find a scraper matching the requested id
      # id - a scraper id as a symbol
      # returns the first matching class, or nil when none is registered
      def self.find_scraper(id)
        @@scrapers.find { |scraper| scraper::SCRAPER_ID == id }
      end

      # find any scrapers matching the requested ids
      # ids - an array of scraper id's as symbols
      # returns an array of matching classes in registration order
      def self.find_scrapers(ids)
        # Array has no contains? method; include? is the membership test
        @@scrapers.select { |scraper| ids.include?(scraper::SCRAPER_ID) }
      end

      # This method can be overwritten in each scraper. It should return a hash containing the binary data
      # and mimetype of the largest cover image it can find for the requested isbn
      #
      # The default implementation downloads the first of :cover_large,
      # :cover_medium or :cover_thumb present in the get_info hash. It returns
      # nil when there is no info, no cover link, a non-200 response, or a
      # StandardError is raised while fetching.
      def get_cover(isbn)
        info = get_info(isbn)
        return nil if info.nil?
        return nil unless info.kind_of?(Hash)

        link = info[:cover_large] || info[:cover_medium] || info[:cover_thumb]
        return nil if link.nil?

        begin
          response = Net::HTTP.get_response URI.parse(link)
          return nil unless response.code == "200"

          result = {}
          result[:data] = response.body
          result[:content_type] = "image/jpeg"
          result
        rescue
          nil
        end
      end

      # This method can be overwritten in each scraper. It should return a hash of any information on
      # the requested isbn it can find
      def get_info(isbn)
        nil
      end

      # This method can be overwritten in each scraper. It should return a link to the requested isbn
      # on the targets website
      def get_link(isbn)
        nil
      end

      # return the symbol used to uniquely identify each scraper
      def scraper_id
        # a bare SCRAPER_ID would be looked up from Base, which does not
        # define it; go through the receiver's class so the subclass constant
        # is found
        self.class::SCRAPER_ID
      end

      # returns the list of every registered scraper class
      def self.scrapers
        @@scrapers
      end
    end

  end
end
@@ -0,0 +1,56 @@
1
+
2
module RBook
  module WWW

    # Scraper for the Harper Collins Australia product catalogue.
    #
    # Registers itself with Base via add_scraper when the class is loaded.
    class HarperCollinsAUScraper < Base

      SCRAPER_ID = :harper_au
      SCRAPER_NAME = "Harper Collins Australia".freeze
      SCRAPER_SITE = "http://www.harpercollins.com.au/".freeze

      #add_publisher( self, "9780006400000", "9780006499999" )
      #add_publisher( self, "9780007100000", "9780007199999" )
      add_scraper( self )

      # Fetches the catalogue page for the supplied isbn and returns a hash of
      # the details found on it, or nil when the page has no title.
      #
      # isbn - an ISBN accepted by ISBN::valid_isbn?
      #
      # Raises ArgumentError when the isbn is not valid.
      def get_info(isbn)
        raise ArgumentError, 'Supplied isbn is not valid' unless ISBN::valid_isbn?(isbn)

        isbn = ISBN::convert_to_isbn13(isbn) unless ISBN::valid_isbn13?(isbn)

        @protocol = "http://"
        @host = "www.harpercollins.com.au"
        @path = "/global_scripts/product_catalog/book_xml.asp?isbn="
        # the page is requested by the ISBN10 form of the isbn
        @link = @protocol + @host + @path + ISBN::convert_to_isbn10(isbn)

        # NOTE(review): Scraper is defined outside this file - presumably an
        # HTML scraping library; each rule maps a CSS selector to a result field.
        rba = Scraper.define do
          process "div.header", :title => :text
          process "div.subtitle", :subtitle => :text
          process "div.byline", :author => :text
          process "img.bookJacket", :cover => "@src"
          result :title, :subtitle, :author, :cover
        end

        content = Net::HTTP.get URI.parse(@link)
        result = rba.scrape(content)

        return nil if result.title.nil?

        info = {}
        info[:isbn] = isbn
        info[:title] = result.title
        info[:subtitle] = result.subtitle
        # a page may have a title but no byline; skip the author rather than
        # failing on nil
        info[:author] = result.author.gsub("by ","") unless result.author.nil?
        info[:cover] = result.cover
        info[:link] = @link
        info[:from_name] = SCRAPER_NAME
        info[:from_url] = SCRAPER_SITE
        info
      end

    end
  end
end
@@ -0,0 +1,55 @@
1
+
2
module RBook
  module WWW

    # Scraper for the Harper Collins United States book pages.
    #
    # Registers itself with Base via add_scraper when the class is loaded.
    class HarperCollinsUSScraper < Base

      SCRAPER_ID = :harper_us
      SCRAPER_NAME = "Harper Collins United States".freeze
      SCRAPER_SITE = "http://www.harpercollins.com/".freeze

      #add_publisher( self, "9780060000000", "9780060999999" )
      add_scraper( self )

      # Fetches the book page for the supplied isbn and returns a hash of the
      # details found on it, or nil when the page has no title.
      #
      # isbn - an ISBN accepted by ISBN::valid_isbn?
      #
      # Raises ArgumentError when the isbn is not valid.
      def get_info(isbn)
        raise ArgumentError, 'Supplied isbn is not valid' unless ISBN::valid_isbn?(isbn)

        isbn = ISBN::convert_to_isbn13(isbn) unless ISBN::valid_isbn13?(isbn)

        @protocol = "http://"
        @host = "www.harpercollins.com"
        @path = "/book/index.aspx?isbn="
        @link = @protocol + @host + @path + isbn

        # NOTE(review): Scraper is defined outside this file - presumably an
        # HTML scraping library; each rule maps a CSS selector to a result field.
        rba = Scraper.define do
          process "h1.bookTitle", :title => :text
          process "h2.bookSubTitle", :subtitle => :text
          process "h3.byLine", :author => :text
          process "img.bookJacket", :cover => "@src"
          result :title, :subtitle, :author, :cover
        end

        content = Net::HTTP.get URI.parse(@link)
        result = rba.scrape(content)

        return nil if result.title.nil?

        info = {}
        info[:isbn] = isbn
        info[:title] = result.title
        info[:subtitle] = result.subtitle
        # a page may have a title but no byline or jacket image; skip the
        # derived values rather than failing on nil
        info[:author] = result.author.gsub("by ", "") unless result.author.nil?
        info[:cover_thumb] = result.cover
        # NOTE(review): the URL has "medium" swapped for "large" yet is stored
        # under :cover_medium - confirm the intended key
        info[:cover_medium] = result.cover.gsub("medium", "large") unless result.cover.nil?
        info[:link] = @link
        info[:from_name] = SCRAPER_NAME
        info[:from_url] = SCRAPER_SITE
        info
      end

    end
  end
end
@@ -0,0 +1,50 @@
1
+
2
module RBook
  module WWW

    # Scraper for the Hodder Headline Australia book pages.
    #
    # Registers itself with Base via add_scraper when the class is loaded.
    class HHAScraper < Base

      SCRAPER_ID = :hha
      SCRAPER_NAME = "Hodder Headline Australia".freeze
      SCRAPER_SITE = "http://www.hha.com.au/".freeze

      #add_publisher( self, "9780340800000", "9780340899999" )
      #add_publisher( self, "9780755300000", "9780755399999" )
      #add_publisher( self, "9780733600000", "9780733699999" )
      add_scraper( self )

      # Fetches the book page for the supplied isbn and returns a hash of the
      # details found on it, or nil when the page has no thumbnail image.
      #
      # isbn - the ISBN to look up; the page is requested by its ISBN10 form
      def get_info(isbn)
        @protocol = "http://"
        @host = "www.hha.com.au"
        @path = "/books/"
        @suffix = ".html"
        @link = @protocol + @host + @path + ISBN::convert_to_isbn10(isbn) + @suffix

        # NOTE(review): Scraper is defined outside this file - presumably an
        # HTML scraping library; each rule maps a CSS selector to a result field.
        rba = Scraper.define do
          process "h1.fiction", :title => :text
          process "p.author", :author => :text
          process "p.thumb>img", :cover_thumb => "@src"
          result :title, :author, :cover_thumb
        end

        content = Net::HTTP.get URI.parse(@link)
        result = rba.scrape(content)

        # the thumbnail, not the title, decides whether the book was found
        return nil if result.cover_thumb.nil?

        info = {}
        info[:isbn] = isbn
        # the title was scraped but never returned; include it when present
        info[:title] = result.title unless result.title.nil?
        info[:author] = result.author
        info[:cover_thumb] = @protocol + @host + result.cover_thumb
        info[:link] = @link
        info[:from_name] = SCRAPER_NAME
        info[:from_url] = SCRAPER_SITE
        info
      end

    end
  end
end
@@ -0,0 +1,62 @@
1
+
2
module RBook

  module WWW

    # Scraper for the Pan Macmillan Australia title pages.
    #
    # Registers itself with Base via add_scraper when the class is loaded.
    class MacmillanScraper < Base

      SCRAPER_ID = :macmillan
      SCRAPER_NAME = "Pan Macmillan".freeze
      SCRAPER_SITE = "http://www.panmacmillan.com.au/".freeze

      #add_publisher( self, "9780312900000", "9780312999999" )
      #add_publisher( self, "9780330400000", "9780330499999" )
      #add_publisher( self, "9781403000000", "9781405099999" )
      add_scraper( self )

      # Stores the URL pieces that get_link and get_info combine into title
      # page and cover image addresses.
      def initialize
        @url_protocol = "http://"
        @url_host = "www.panmacmillan.com.au"
        @url_path = "/display_title.asp?ISBN="
        # NOTE(review): a fixed author query string is appended to every
        # title URL - confirm the site needs it
        @url_suffix = "&Author=Barker,%20Robin"
      end

      # Fetches the title page for the supplied isbn and returns a hash of the
      # details found on it. Returns nil when the isbn cannot be converted to
      # ISBN13 or the page has no title.
      def get_info(isbn)
        isbn = ISBN::convert_to_isbn13(isbn)
        return nil if isbn.nil?

        # NOTE(review): Scraper is defined outside this file - presumably an
        # HTML scraping library; each rule maps a CSS selector to a result field.
        mac = Scraper.define do
          process "div.titlecontent>div.isbn>span", :isbn => :text
          process "td[width=70%]>h1", :title => :text
          process "a[title=Click on image to view a larger version]>img", :cover_medium => "@src"
          process "a[title=Click on image to view a larger version]", :cover_large => "@href"
          result :isbn, :title, :cover_medium, :cover_large
        end

        page = Net::HTTP.get URI.parse(get_link(isbn))
        scraped = mac.scrape(page)

        return nil if scraped.title.nil?

        site_root = @url_protocol + @url_host

        info = {}
        info[:isbn] = isbn
        info[:title] = scraped.title
        # every ".." in the scraped paths is removed before appending to the host
        unless scraped.cover_medium.nil?
          info[:cover_medium] = site_root + scraped.cover_medium.gsub("..", "")
        end
        unless scraped.cover_large.nil?
          info[:cover_large] = site_root + scraped.cover_large.gsub("..", "")
        end
        info[:link] = get_link(isbn)
        info[:from_name] = SCRAPER_NAME
        info[:from_url] = SCRAPER_SITE
        info
      end

      # Returns the title page URL for the supplied isbn (built from its
      # ISBN10 form), or nil when the isbn is not valid.
      def get_link(isbn)
        return nil unless ISBN::valid_isbn?(isbn)

        isbn10 = ISBN::convert_to_isbn10(isbn)
        @url_protocol + @url_host + @url_path + isbn10 + @url_suffix
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
+
2
module RBook
  module WWW

    # Scraper for Orbis Books titles; the pages are fetched from the
    # maryknollmall.org host.
    #
    # Registers itself with Base via add_scraper when the class is loaded.
    class OrbisScraper < Base

      SCRAPER_ID = :orbis
      SCRAPER_NAME = "Orbis Books"
      SCRAPER_SITE = "http://www.orbisbooks.com/"

      #add_publisher( self, "978157070000", "9781570799999")
      add_scraper( self )

      # Fetches the description page for the supplied isbn and returns a hash
      # with the thumbnail cover and link, or nil when the page has no
      # thumbnail image.
      #
      # isbn - the ISBN to look up; the page is requested by the grouped
      #        ISBN10 form produced by ISBN::add_groups
      def get_info(isbn)
        @protocol = "http://"
        @host = "www.maryknollmall.org"
        @path = "/description.cfm?ISBN="
        @grouped_isbn = ISBN::add_groups(ISBN::convert_to_isbn10(isbn))
        @link = @protocol + @host + @path + @grouped_isbn

        # NOTE(review): Scraper is defined outside this file - presumably an
        # HTML scraping library; each rule maps a CSS selector to a result field.
        orbis = Scraper.define do
          process "tr>td[colspan=4]>font[size=3]", :description => :text # doesn't currently work
          process "table>tr>td[rowspan=2]>img", :cover_thumb => "@src"
          result :description, :cover_thumb
        end

        page = Net::HTTP.get URI.parse(@link)
        scraped = orbis.scrape(page)

        # the thumbnail decides whether the book was found
        return nil if scraped.cover_thumb.nil?

        {
          :isbn => isbn,
          :cover_thumb => @protocol + @host + scraped.cover_thumb,
          :link => @link,
          :from_name => SCRAPER_NAME,
          :from_url => SCRAPER_SITE
        }
      end

    end
  end
end