jaleb 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.rspec +2 -0
- data/.travis.yml +5 -0
- data/Gemfile +19 -0
- data/LICENSE.txt +21 -0
- data/README.md +91 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/jaleb.gemspec +29 -0
- data/last_response +5615 -0
- data/lib/jaleb.rb +68 -0
- data/lib/jaleb/data.rb +22 -0
- data/lib/jaleb/model.rb +21 -0
- data/lib/jaleb/models/amazon.rb +51 -0
- data/lib/jaleb/models/base.rb +26 -0
- data/lib/jaleb/models/ebay.rb +32 -0
- data/lib/jaleb/models/etsy.rb +31 -0
- data/lib/jaleb/models/gamecouk.rb +40 -0
- data/lib/jaleb/models/googleshopping.rb +31 -0
- data/lib/jaleb/models/newegg.rb +33 -0
- data/lib/jaleb/models/souq.rb +32 -0
- data/lib/jaleb/models/thinkgeek.rb +31 -0
- data/lib/jaleb/nokogiri.rb +53 -0
- data/lib/jaleb/string.rb +33 -0
- data/lib/jaleb/version.rb +3 -0
- metadata +113 -0
data/lib/jaleb.rb
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
# = Jaleb
|
2
|
+
# Author:: Hasan Basheer - 2017
|
3
|
+
require "uri"
|
4
|
+
require_relative "jaleb/version"
|
5
|
+
require_relative "jaleb/data"
|
6
|
+
require_relative "jaleb/model"
|
7
|
+
require_relative "jaleb/string"
|
8
|
+
require_relative "jaleb/nokogiri"
|
9
|
+
|
10
|
+
|
11
|
+
# Top-level namespace of the gem: picks a site model for a URL, downloads the
# page and returns the parsed product.
module Jaleb
  LIBRARY_PATH = File.join(File.dirname(__FILE__), 'jaleb')
  MODEL_PATH = File.join(LIBRARY_PATH, 'models')

  # Module Methods
  class << self
    # Fetch product information for +url+.
    #
    # The model class is chosen by Jaleb::Model.identify; when it returns
    # nothing, Jaleb::Model::Base is used as the fallback. The page is
    # downloaded with Jaleb::Data.read and handed to the model's #parse.
    #
    # Returns the model instance with its +url+ attribute set. Errors raised
    # by identify (for strings that are not URLs) or by the download
    # propagate to the caller.
    def fetch(url)
      model_class = Jaleb::Model.identify(url) || Jaleb::Model::Base
      data = Jaleb::Data.read(url)

      product = model_class.new
      product.parse(data)
      product.url = url # remember where the data came from
      product
    end

    # Names of the site models as symbols, one per file in MODEL_PATH,
    # excluding :base. Sorted so the result does not depend on the order in
    # which the file system lists the directory.
    def models
      Dir[File.join(MODEL_PATH, "*.rb")].sort
        .map { |path| File.basename(path, ".rb").to_sym }
        .reject { |name| name == :base }
    end

    # Site model classes, in the same order as #models.
    #   Jaleb.model_classes # => [Jaleb::Model::Amazon, Jaleb::Model::Ebay, ...]
    # Relies on String#camelize / String#constantize being available
    # (presumably provided by jaleb/string, which is required above).
    def model_classes
      models.map { |name| ["Jaleb", "Model", name.to_s.camelize].join("::").constantize }
    end
  end

  # Autoload Models: every file in MODEL_PATH is registered under the
  # camelized form of its file name and loaded on first reference.
  module Model
    autoload :Base, File.join(MODEL_PATH, "base")
    Jaleb.models.each do |model|
      autoload model.to_s.camelize.to_sym, File.join(MODEL_PATH, model.to_s)
    end
  end
end
|
data/lib/jaleb/data.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
require "open-uri"
|
3
|
+
|
4
|
+
|
5
|
+
module Jaleb
  # This class is responsible for fetching a page and parsing it into a
  # Nokogiri document.
  class Data
    # Browser-like User-Agent header sent with every request.
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36'.freeze

    # File (two directories above this one) that receives the raw body of the
    # most recent download, for debugging.
    LAST_RESPONSE_FILE = File.expand_path(File.join("..", "..", "last_response"), File.dirname(__FILE__))

    # Download +url+ and return it parsed by ::Nokogiri::HTML.
    #
    # The URL is opened through its URI object rather than Kernel#open, so a
    # string can never be interpreted as a local path or a "|command" pipe.
    # The block form closes the response, and File.write closes the debug
    # file; the body is read only once and reused for both.
    #
    # Errors from URI parsing, the HTTP request or the debug-file write
    # propagate to the caller.
    def self.read(url)
      headers = { 'User-Agent' => USER_AGENT }
      body = URI(url).open(headers) { |response| response.read }

      doc = ::Nokogiri::HTML(body)
      # Save contents of URL/Remote File for debugging
      File.write(LAST_RESPONSE_FILE, body)
      doc
    end
  end
end
|
data/lib/jaleb/model.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
module Jaleb
  module Model
    class << self
      # Detect the model class responsible for +url+.
      #
      # The host part of the URL is matched against the +regexp+ of every
      # class returned by Jaleb.model_classes. When several match, the last
      # one in that list wins. Returns nil when none matches.
      #
      # Raises ArgumentError ("not a url") when +url+ does not match
      # ::URI.regexp.
      def identify(url)
        raise ArgumentError, "not a url" unless url =~ ::URI.regexp

        host = ::URI.parse(url).host
        Jaleb.model_classes.select { |model| host =~ model.regexp }.last
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module Jaleb
  module Model
    # Product model for amazon.com pages.
    class Amazon < Jaleb::Model::Base
      # A regular expression for determining if a url comes from a specific service/website
      def self.regexp
        /amazon\.com/
      end

      # Parse data and look for object attributes to give to object.
      #
      # Sets name, description, price, images and image. Each attribute is
      # tried against several page layouts, in order, and the first lookup
      # that yields a value is kept. Does nothing beyond the base parse when
      # the stored doc is not an HTML document.
      def parse(data)
        super(data)

        case doc
        when Nokogiri::HTML::Document
          # Get Name: product heading, else the part of <title> before " - "
          self.name = doc.css("h1#title").first_string
          self.name = doc.xpath("string(//title)").split(" - ").first unless self.name

          # Get Description
          self.description = doc.css(".productDescriptionWrapper").first_string

          # Get description from the meta description tag if not found
          self.description = doc.xpath("//meta[@name='description']/@content").first_string if description.nil?

          # Get Price
          parse_price(doc.css("#actualPriceValue").first_string)
          parse_price(doc.css("#priceblock_ourprice").first_string) unless self.price
          parse_price(doc.css("#priceblock_saleprice").first_string) unless self.price
          parse_price(doc.xpath("//span[contains(@id, 'price')]").first_string) unless self.price

          # Get Unqualified Price
          parse_price(doc.xpath("//*[contains(@id, 'unqualifiedBuyBox')]//span").first_string) unless self.price

          # Get Used Price
          parse_price(doc.xpath("//*[contains(@id, 'secondaryUsedAndNew')]//*[@class='price']").first_string) unless self.price

          # Get Images. attribute_array always returns an Array, so every
          # fallback has to test for emptiness; a plain truthiness check
          # would never fire.
          self.images = doc.xpath("//*[@data-action='main-image-click']//img").attribute_array
          self.images = doc.xpath("//*[@id='imageBlock']//img").attribute_array if self.images.empty?

          # Get images for in-house products (kindle, etc.)
          self.images = doc.xpath("//*[@id='kib-ma-container-0']//img").attribute_array if self.images.empty?

          # Get images for third-party sellers
          self.images = doc.xpath("//*[@id='prodImageContainer']//img").attribute_array if self.images.empty?

          self.image = images.first
        end
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require "hashie"
|
2
|
+
require "money"
|
3
|
+
require "monetize"
|
4
|
+
|
5
|
+
module Jaleb
  module Model
    # Generic model and superclass of the site-specific models. Being a
    # Hashie::Mash, it keeps parsed attributes (doc, name, price, ...) as
    # hash entries that are read and written with accessor-style calls.
    class Base < ::Hashie::Mash
      # Store +data+ as doc (so callers can reach the raw document later)
      # and use the page <title> as the initial name.
      def parse(data)
        self.doc = data

        title_nodes = doc.css("head > title")
        self.name = title_nodes.first_string
      end

      # Turn a raw price string into usable data and store it as price.
      # A nil +raw_price+ leaves price untouched.
      #   model.parse_price("$5.00") # => #<Money cents:500 currency:USD>
      #   model.price.to_f # => 5.0
      #   model.price.currency.symbol # => '$'
      def parse_price(raw_price = nil)
        self.price = ::Monetize.parse(raw_price) unless raw_price.nil?
      end
    end # Base
  end # Model
end # Jaleb
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Jaleb
  module Model
    # Product model for ebay.com pages.
    class Ebay < Jaleb::Model::Base
      # A regular expression for determining if a url comes from a specific service/website
      def self.regexp
        /ebay\.com/
      end

      # Parse data and look for object attributes to give to object.
      #
      # Sets name, price, images and image from the page's itemprop markup.
      # Does nothing beyond the base parse when the stored doc is not an
      # HTML document.
      def parse(data)
        super(data)

        case doc
        when Nokogiri::HTML::Document
          # Get Name
          self.name = doc.xpath("//h1[@itemprop='name']").first_string

          # Get Description
          # OMITTED: This is tough to get because ebay item descriptions are custom html/content created by sellers

          # Get Price (the "US" prefix is removed before parsing)
          raw_price = doc.xpath("//span[@itemprop='price']").first_string
          parse_price(raw_price.gsub(/US/, "")) if raw_price

          # Get Image. Select the src attribute itself: first_string on the
          # <img> element would return its text content, not the image URL.
          self.images = [{:src => doc.xpath("//span[@itemprop='image']/img/@src").first_string}]
          self.image = images.first
        end
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Jaleb
  module Model
    # Product model for etsy.com pages.
    class Etsy < Jaleb::Model::Base
      # Pattern tested against a URL's host to decide whether this model applies.
      def self.regexp
        /etsy\.com/
      end

      # Run the base parse, then fill name, description, price, images and
      # image from the listing page. Nothing more happens when the stored
      # doc is not an HTML document.
      def parse(data)
        super(data)
        return unless doc.is_a?(Nokogiri::HTML::Document)

        # Name and description
        self.name = doc.xpath("//span[@itemprop='name']").first_string
        self.description = doc.xpath("//div[@id='description-text']").first_string

        # Price
        listing_price = doc.css("span#listing-price").first_string
        parse_price(listing_price)

        # Images: every <img> in the main image container; the first one is
        # the primary image
        self.images = doc.xpath("//div[@id='image-main']//img").attribute_array
        self.image = images.first
      end
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require "date"

module Jaleb
  module Model
    # Product model for game.co.uk pages.
    class Gamecouk < Jaleb::Model::Base
      # A regular expression for determining if a url comes from a specific service/website
      def self.regexp
        /game\.co\.uk/
      end

      # Parse data and look for object attributes to give to object.
      #
      # Sets name, description, release_date, price, images and image.
      # Does nothing beyond the base parse when the stored doc is not an
      # HTML document. Date.strptime raises if the release date text is not
      # in "%d-%b-%Y" form.
      def parse(data)
        super(data)

        case doc
        when Nokogiri::HTML::Document
          # Get Name
          self.name = doc.xpath("//meta[@property='og:title']/@content").first_string

          # Get Description (string form of the matched nodes, not sanitized text)
          self.description = doc.css('div#primary div#details.panel div.description').to_s

          # Get Release Date: "Released on 01-Jan-2017" becomes a Date
          self.release_date = doc.css('div#primary div#details.panel p.releaseDate').first_string
          self.release_date = Date.strptime(self.release_date.gsub('Released on ',''), '%d-%b-%Y') if self.release_date

          # Get Price. The currency is passed to Monetize explicitly instead
          # of switching Money.default_currency, which is process-wide and
          # would leak GBP into every later parse for other sites.
          raw_price = doc.css("span.price").first_string
          self.price = ::Monetize.parse(raw_price.gsub(/Only /, ""), ::Money::Currency.new("GBP")) if raw_price

          # Get Image
          self.images = [{:src => doc.xpath("//meta[@property='og:image']/@content").first_string}]
          self.image = images.first
        end
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Jaleb
  module Model
    # Product model for pages served from google.com.
    class Googleshopping < Jaleb::Model::Base
      # Pattern tested against a URL's host to decide whether this model applies.
      def self.regexp
        /google\.com/
      end

      # Run the base parse, then fill name, description, price, images and
      # image. The name set by the base parse is kept when the product
      # heading is missing. Nothing more happens when the stored doc is not
      # an HTML document.
      def parse(data)
        super(data)
        return unless doc.is_a?(Nokogiri::HTML::Document)

        # Name: only override the base name when the heading was found
        heading = doc.css('h1#product-name span.main-title').first_string
        self.name = heading if heading

        # Description
        self.description = doc.css("#product-description-full").first_string

        # Price
        summary_price = doc.css('#summary-prices .price').first_string
        parse_price(summary_price)

        # Images
        self.images = doc.css('div#product-basic-info img').attribute_array
        self.image = images.first
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Jaleb
  module Model
    # Product model for newegg.com pages.
    class Newegg < Jaleb::Model::Base
      # Pattern tested against a URL's host to decide whether this model applies.
      def self.regexp
        /newegg\.com/
      end

      # Run the base parse, then fill name, description, images and image.
      #
      # Any StandardError raised along the way (including by the base parse)
      # is swallowed and nil is returned, leaving whatever attributes were
      # already set. Nothing more happens when the stored doc is not an HTML
      # document.
      def parse(data)
        super(data)
        return unless doc.is_a?(Nokogiri::HTML::Document)

        # Name
        self.name = doc.css("#synopsis .grpDesc .wrapper h1 > span").first_string
        #self.name = doc.css("#synopsis .grpDesc .wrapper > .span").first_string

        # Description - Not always reliable.
        self.description = doc.css(".itmDesc > p").first_string

        # Price (currently disabled because price is displayed after page load)
        #parse_price doc.css("li.price-current").attribute("content").content

        # Images: a missing main image makes the next line raise, which the
        # rescue below turns into a nil return
        main_image_src = doc.css('.mainSlide > img').attribute("src")
        self.images = [{:src => main_image_src.content}]
        self.image = images.first
      rescue
        nil
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Jaleb
  module Model
    # Product model for souq.com pages.
    class Souq < Jaleb::Model::Base
      # Pattern tested against a URL's host to decide whether this model applies.
      def self.regexp
        /souq\.com/
      end

      # Run the base parse, then fill name, description, price, images and
      # image, mostly from the page's Open Graph meta tags. Nothing more
      # happens when the stored doc is not an HTML document.
      def parse(data)
        super(data)
        return unless doc.is_a?(Nokogiri::HTML::Document)

        # Name and description
        self.name = doc.xpath("//meta[@property='og:title']/@content").first_string
        self.description = doc.xpath("//meta[@property='og:description']/@content").first_string

        # Price: commas are removed before parsing
        price_text = doc.css('h3.price').first_string
        parse_price(price_text.gsub(/,/, "")) if price_text

        # Images
        og_image = doc.xpath("//meta[@property='og:image']/@content").first_string
        self.images = [{:src => og_image}]
        self.image = images.first
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Jaleb
  module Model
    # Product model for thinkgeek.com pages.
    class Thinkgeek < Jaleb::Model::Base
      # Pattern tested against a URL's host to decide whether this model applies.
      def self.regexp
        /thinkgeek\.com/
      end

      # Run the base parse, then fill name, description, price, images and
      # image, mostly from the page's Open Graph meta tags. Nothing more
      # happens when the stored doc is not an HTML document.
      def parse(data)
        super(data)
        return unless doc.is_a?(Nokogiri::HTML::Document)

        # Name and description
        self.name = doc.xpath("//meta[@property='og:title']/@content").first_string
        self.description = doc.xpath("//meta[@property='og:description']/@content").first_string

        # Price: best effort, any StandardError here is ignored
        begin
          parse_price(doc.xpath("//form[@id='buy']/h3").first_string)
        rescue StandardError
          nil
        end

        # Images
        og_image = doc.xpath("//meta[@property='og:image']/@content").first_string
        self.images = [{:src => og_image}]
        self.image = images.first
      end
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require "cgi"
|
2
|
+
|
3
|
+
module Jaleb
  # This contains helper methods for Nokogiri interactions
  module Nokogiri
    module HTML
      # Deliberately empty. It is mixed into ::Nokogiri::HTML::Document at
      # the bottom of this file. Code written inside the Jaleb namespace that
      # names `Nokogiri::HTML::Document` resolves to this module, and the
      # mixin is what makes such a `when`/`is_a?` check match real documents.
      module Document
      end # Document
    end # HTML

    module XML
      # Helpers mixed into ::Nokogiri::XML::NodeSet. Both rely on
      # String#sanitize (presumably provided by jaleb/string).
      module NodeSet
        # Sanitized string for the first node of the set: the content of an
        # element or the value of an attribute. Returns nil for an empty set
        # or any other kind of node.
        def first_string
          node = first
          case node
          when ::Nokogiri::XML::Element then node.content.sanitize
          when ::Nokogiri::XML::Attr then node.value.sanitize
          end
        end

        # Convert the set into an array of hashes, one per node.
        #   @doc.xpath("//img").attribute_array # => [{:element => "img", :src => ".../someimage.png"}]
        # An element contributes its tag name under :element plus one entry
        # per attribute (symbol key, sanitized value). Any other node
        # contributes an empty hash.
        def attribute_array
          map do |node|
            attrs = {}
            if node.is_a?(::Nokogiri::XML::Element)
              attrs[:element] = node.name
              node.attributes.each do |key, attribute|
                next unless attribute.is_a?(::Nokogiri::XML::Attr)

                attrs[key.to_sym] = attribute.value.sanitize
              end
            end
            attrs
          end
        end
      end # NodeSet
    end # XML
  end # Nokogiri
end # Jaleb
|
51
|
+
|
52
|
+
# Mix the Jaleb helper modules into the real Nokogiri classes.
::Nokogiri::HTML::Document.include(::Jaleb::Nokogiri::HTML::Document)
::Nokogiri::XML::NodeSet.include(::Jaleb::Nokogiri::XML::NodeSet)
|