jaleb 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,68 @@
1
+ # = Jaleb
2
+ # Author:: Hasan Basheer - 2017
3
+ require "uri"
4
+ require_relative "jaleb/version"
5
+ require_relative "jaleb/data"
6
+ require_relative "jaleb/model"
7
+ require_relative "jaleb/string"
8
+ require_relative "jaleb/nokogiri"
9
+
10
+
11
module Jaleb
  LIBRARY_PATH = File.join(File.dirname(__FILE__), 'jaleb')
  MODEL_PATH = File.join(LIBRARY_PATH, 'models')

  # Module-level entry points.
  class << self
    # Fetch the page behind +url+ and return a populated product object.
    #
    # The model class is chosen by Jaleb::Model.identify; when that returns
    # nothing, Jaleb::Model::Base is used. The page is read with
    # Jaleb::Data.read, handed to the model's +parse+, and the url is stored
    # on the product before it is returned.
    def fetch(url)
      parser_class = Jaleb::Model.identify(url) || Jaleb::Model::Base
      page = Jaleb::Data.read(url)

      product = parser_class.new
      product.parse(page)
      product.url = url
      product
    end

    # Names of the model files as symbols, without :base.
    def models
      model_files
        .map { |path| File.basename(path, ".rb").to_sym }
        .reject { |name| name == :base }
    end

    # Model classes that correspond to the model files, without Base.
    #   Jaleb.model_classes = [Amazon, Ebay, ThinkGeek]
    def model_classes
      model_files.each_with_object([]) do |path, classes|
        stem = File.basename(path, ".rb")
        next if stem == "base"

        classes << "Jaleb::Model::#{stem.camelize}".constantize
      end
    end

    private

    # Paths of every Ruby file in the models directory.
    def model_files
      Dir[File.join(MODEL_PATH, "*.rb")]
    end
  end

  # Register an autoload for Base and for every model file found on disk.
  module Model
    autoload :Base, File.join(MODEL_PATH, "base")
    Jaleb.models.each do |model|
      autoload model.to_s.camelize.to_sym, File.join(MODEL_PATH, model.to_s)
    end
  end
end
@@ -0,0 +1,22 @@
1
+ require "nokogiri"
2
+ require "open-uri"
3
+
4
+
5
module Jaleb
  # This class is responsible for fetching and parsing data.
  class Data
    # Read +url+ and return its contents as a Nokogiri HTML document.
    #
    # The request is sent with a browser-like User-Agent header. As a
    # debugging aid, the raw body of the response is also written to a file
    # named "last_response" two directories above this file.
    #
    # Errors raised while opening the url or writing the debug file are not
    # rescued here.
    def self.read(url)
      opt = {
        'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36'
      }

      # URI.open is used because Kernel#open no longer opens URLs on
      # Ruby 3.0+. The block form closes the response once the body is read.
      body = ::URI.open(url, opt) { |response| response.read }
      doc = ::Nokogiri::HTML(body)

      # Save contents of URL/Remote File for debugging. File.write closes
      # the file handle itself.
      last_response_file = File.expand_path(File.join("..", "..", "last_response"), File.dirname(__FILE__))
      File.write(last_response_file, body)

      doc
    end
  end
end
@@ -0,0 +1,21 @@
1
module Jaleb
  module Model
    class << self
      # Detect the model class responsible for +url+.
      #
      # The url's host is matched against the +regexp+ of every class
      # returned by Jaleb.model_classes. When several classes match, the last
      # one in that list wins.
      #
      # Returns the matching class, or nil when no class matches.
      # Raises ArgumentError when +url+ does not look like a URI.
      def identify(url)
        raise ArgumentError, "not a url" unless url =~ ::URI.regexp

        host = ::URI.parse(url).host
        Jaleb.model_classes.select { |model| host =~ model.regexp }.last
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
module Jaleb
  module Model
    # Model for amazon.com product pages.
    class Amazon < Jaleb::Model::Base
      # A regular expression for determining if a url comes from a specific service/website
      def self.regexp
        /amazon\.com/
      end

      # Parse data and look for object attributes to give to object.
      #
      # Sets name, description, price, images and image. Each attribute is
      # looked up in several places on the page; the first one that yields a
      # value is used. Nothing beyond Base#parse is done when the stored
      # document is not an HTML document.
      def parse(data)
        super(data)

        case doc
        when Nokogiri::HTML::Document
          # Get Name, falling back to the first part of the page title
          self.name = doc.css("h1#title").first_string
          self.name = doc.xpath("string(//title)").split(" - ").first unless self.name

          # Get Description
          self.description = doc.css(".productDescriptionWrapper").first_string

          # Get description from the meta description tag if not found
          self.description = doc.xpath("//meta[@name='description']/@content").first_string if description.nil?

          # Get Price
          parse_price(doc.css("#actualPriceValue").first_string)
          parse_price(doc.css("#priceblock_ourprice").first_string) unless self.price
          parse_price(doc.css("#priceblock_saleprice").first_string) unless self.price
          parse_price(doc.xpath("//span[contains(@id, 'price')]").first_string) unless self.price

          # Get Unqualified Price
          parse_price(doc.xpath("//*[contains(@id, 'unqualifiedBuyBox')]//span").first_string) unless self.price

          # Get Used Price
          parse_price(doc.xpath("//*[contains(@id, 'secondaryUsedAndNew')]//*[@class='price']").first_string) unless self.price

          # Get Images. attribute_array always returns an array, so every
          # fallback has to test for emptiness rather than for nil.
          self.images = doc.xpath("//*[@data-action='main-image-click']//img").attribute_array
          self.images = doc.xpath("//*[@id='imageBlock']//img").attribute_array if images.empty?

          # Get images for in-house products (kindle, etc.)
          self.images = doc.xpath("//*[@id='kib-ma-container-0']//img").attribute_array if images.empty?

          # Get images for third-party sellers
          self.images = doc.xpath("//*[@id='prodImageContainer']//img").attribute_array if images.empty?

          self.image = images.first
        end
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
+ require "hashie"
2
+ require "money"
3
+ require "monetize"
4
+
5
module Jaleb
  module Model
    # Generic product model; the site-specific models inherit from it.
    class Base < ::Hashie::Mash
      # Store the fetched document and use the page title as the name.
      def parse(data)
        # Keep the document so the user can still reach it later.
        self.doc = data

        title_nodes = doc.css("head > title")
        self.name = title_nodes.first_string
      end

      # Turn a raw price string into a money object stored in +price+.
      # Does nothing when +raw_price+ is nil.
      #   model.parse_price("$5.00")  # => #<Money cents:500 currency:USD>
      #   model.price.to_f            # => 5.0
      #   model.price.currency.symbol # => '$'
      def parse_price(raw_price = nil)
        self.price = ::Monetize.parse(raw_price) unless raw_price.nil?
      end
    end # Base
  end # Model
end # Jaleb
@@ -0,0 +1,32 @@
1
module Jaleb
  module Model
    # Model for ebay.com item pages.
    class Ebay < Jaleb::Model::Base
      # A regular expression for determining if a url comes from a specific service/website
      def self.regexp
        /ebay\.com/
      end

      # Parse data and look for object attributes to give to object.
      #
      # Sets name, price, images and image. Nothing beyond Base#parse is done
      # when the stored document is not an HTML document.
      def parse(data)
        super(data)

        case doc
        when Nokogiri::HTML::Document
          # Get Name
          self.name = doc.xpath("//h1[@itemprop='name']").first_string

          # Get Description
          # OMITTED: This is tough to get because ebay item descriptions are custom html/content created by sellers

          # Get Price; "US" is removed from the raw text before parsing
          raw_price = doc.xpath("//span[@itemprop='price']").first_string
          parse_price(raw_price.gsub(/US/, "")) if raw_price

          # Get Image. The src attribute is selected, not the <img> element:
          # first_string on an element returns its text content, which is not
          # the image url.
          self.images = [{:src => doc.xpath("//span[@itemprop='image']/img/@src").first_string}]
          self.image = images.first
        end
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
module Jaleb
  module Model
    # Model for etsy.com listing pages.
    class Etsy < Jaleb::Model::Base
      # Pattern used to decide whether a url belongs to this site.
      def self.regexp
        /etsy\.com/
      end

      # Fill name, description, price, images and image from the page.
      # Only Base#parse runs when the stored document is not an HTML document.
      def parse(data)
        super(data)
        return unless doc.is_a?(Nokogiri::HTML::Document)

        # Name and description
        self.name = doc.xpath("//span[@itemprop='name']").first_string
        self.description = doc.xpath("//div[@id='description-text']").first_string

        # Price
        listing_price = doc.css("span#listing-price").first_string
        parse_price(listing_price)

        # Images; the first one doubles as the main image
        self.images = doc.xpath("//div[@id='image-main']//img").attribute_array
        self.image = images.first
      end
    end
  end
end
@@ -0,0 +1,40 @@
1
require "date"

module Jaleb
  module Model
    # Model for game.co.uk product pages.
    class Gamecouk < Jaleb::Model::Base
      # A regular expression for determining if a url comes from a specific service/website
      def self.regexp
        /game\.co\.uk/
      end

      # Parse data and look for object attributes to give to object.
      #
      # Sets name, description, release_date, price, images and image.
      # Nothing beyond Base#parse is done when the stored document is not an
      # HTML document. Date.strptime raises when the release date text does
      # not have the expected day-month-year form.
      def parse(data)
        super(data)

        case doc
        when Nokogiri::HTML::Document
          # Get Name
          self.name = doc.xpath("//meta[@property='og:title']/@content").first_string

          # Get Description (the markup of the matched nodes, as a string)
          self.description = doc.css('div#primary div#details.panel div.description').to_s

          # Get Release Date, e.g. "Released on 01-Jan-2017"
          release_text = doc.css('div#primary div#details.panel p.releaseDate').first_string
          self.release_date = release_text && Date.strptime(release_text.gsub('Released on ', ''), '%d-%b-%Y')

          # Get Price. GBP is passed as the fallback currency for this one
          # parse instead of changing Money.default_currency, which is global
          # and would affect every price parsed afterwards.
          raw_price = doc.css("span.price").first_string
          self.price = ::Monetize.parse(raw_price.gsub(/Only /, ""), "GBP") if raw_price

          # Get Image
          self.images = [{:src => doc.xpath("//meta[@property='og:image']/@content").first_string}]
          self.image = images.first
        end
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
module Jaleb
  module Model
    # Model for product pages on google.com.
    class Googleshopping < Jaleb::Model::Base
      # Pattern used to decide whether a url belongs to this site.
      def self.regexp
        /google\.com/
      end

      # Fill name, description, price, images and image from the page.
      # Only Base#parse runs when the stored document is not an HTML document.
      def parse(data)
        super(data)
        return unless doc.is_a?(Nokogiri::HTML::Document)

        # Name: keep the one set by Base#parse unless the product title exists
        product_title = doc.css('h1#product-name span.main-title').first_string
        self.name = product_title if product_title

        # Description
        self.description = doc.css("#product-description-full").first_string

        # Price
        summary_price = doc.css('#summary-prices .price').first_string
        parse_price(summary_price)

        # Images; the first one doubles as the main image
        self.images = doc.css('div#product-basic-info img').attribute_array
        self.image = images.first
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
module Jaleb
  module Model
    # Model for newegg.com product pages.
    class Newegg < Jaleb::Model::Base
      # A regular expression for determining if a url comes from a specific service/website
      def self.regexp
        /newegg\.com/
      end

      # Parse data and look for object attributes to give to object.
      #
      # Sets name and description, plus images and image when the main slide
      # image is present. Parsing is best effort: any StandardError raised
      # along the way is swallowed and nil is returned.
      def parse(data)
        super(data)
        case doc
        when Nokogiri::HTML::Document
          # Get Name
          self.name = doc.css("#synopsis .grpDesc .wrapper h1 > span").first_string
          #self.name = doc.css("#synopsis .grpDesc .wrapper > .span").first_string

          # Description - Not always reliable.
          self.description = doc.css(".itmDesc > p").first_string

          # Get Price (currently disabled because price is displayed after page load)
          #parse_price doc.css("li.price-current").attribute("content").content

          # Get Images. A page without the main slide image is an expected
          # case, so it is checked explicitly instead of raising into the
          # rescue below; images and image stay unset then.
          src_attr = doc.css('.mainSlide > img').attribute("src")
          if src_attr
            self.images = [{:src => src_attr.content}]
            self.image = images.first
          end
        end
      rescue StandardError
        nil
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
module Jaleb
  module Model
    # Model for souq.com product pages.
    class Souq < Jaleb::Model::Base
      # Pattern used to decide whether a url belongs to this site.
      def self.regexp
        /souq\.com/
      end

      # Fill name, description, price, images and image from the page.
      # Only Base#parse runs when the stored document is not an HTML document.
      def parse(data)
        super(data)
        return unless doc.is_a?(Nokogiri::HTML::Document)

        # Name and description come from the Open Graph meta tags
        self.name = doc.xpath("//meta[@property='og:title']/@content").first_string
        self.description = doc.xpath("//meta[@property='og:description']/@content").first_string

        # Price, with commas removed before parsing
        price_text = doc.css('h3.price').first_string
        parse_price(price_text.gsub(/,/, "")) if price_text

        # Single image taken from the Open Graph image tag
        og_image = doc.xpath("//meta[@property='og:image']/@content").first_string
        self.images = [{ :src => og_image }]
        self.image = images.first
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
module Jaleb
  module Model
    # Model for thinkgeek.com product pages.
    class Thinkgeek < Jaleb::Model::Base
      # Pattern used to decide whether a url belongs to this site.
      def self.regexp
        /thinkgeek\.com/
      end

      # Fill name, description, price, images and image from the page.
      # Only Base#parse runs when the stored document is not an HTML document.
      def parse(data)
        super(data)
        return unless doc.is_a?(Nokogiri::HTML::Document)

        # Name and description come from the Open Graph meta tags
        self.name = doc.xpath("//meta[@property='og:title']/@content").first_string
        self.description = doc.xpath("//meta[@property='og:description']/@content").first_string

        # Price is best effort: a failure while reading it is ignored
        begin
          parse_price(doc.xpath("//form[@id='buy']/h3").first_string)
        rescue StandardError
          nil
        end

        # Single image taken from the Open Graph image tag
        og_image = doc.xpath("//meta[@property='og:image']/@content").first_string
        self.images = [{ :src => og_image }]
        self.image = images.first
      end
    end
  end
end
@@ -0,0 +1,53 @@
1
+ require "cgi"
2
+
3
module Jaleb
  # Helper mixins for Nokogiri interactions.
  module Nokogiri
    module HTML
      # Mixin for HTML documents; it defines no methods.
      module Document
      end # Document
    end # HTML

    module XML
      module NodeSet
        # Sanitized string of the first node in the set: the content of an
        # element or the value of an attribute. Returns nil for an empty set
        # or any other kind of node.
        def first_string
          head = first
          if head.is_a?(::Nokogiri::XML::Element)
            head.content.sanitize
          elsif head.is_a?(::Nokogiri::XML::Attr)
            head.value.sanitize
          end
        end

        # One hash per node in the set. An element contributes its tag name
        # under :element plus its attributes with sanitized values; any other
        # node contributes an empty hash.
        #   @doc.xpath("//img").attribute_array # => [{:element => "img", :src => ".../someimage.png"}]
        def attribute_array
          each_with_object([]) do |node, rows|
            row = {}
            if node.is_a?(::Nokogiri::XML::Element)
              row[:element] = node.name
              node.attributes.each do |key, attr|
                row[key.to_sym] = attr.value.sanitize if attr.is_a?(::Nokogiri::XML::Attr)
              end
            end
            rows << row
          end
        end
      end # NodeSet
    end # XML
  end # Nokogiri
end # Jaleb
51
+
52
# Mix the Jaleb helper modules into the matching Nokogiri classes.
::Nokogiri::HTML::Document.include(::Jaleb::Nokogiri::HTML::Document)
::Nokogiri::XML::NodeSet.include(::Jaleb::Nokogiri::XML::NodeSet)