jaleb 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.rspec +2 -0
- data/.travis.yml +5 -0
- data/Gemfile +19 -0
- data/LICENSE.txt +21 -0
- data/README.md +91 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/jaleb.gemspec +29 -0
- data/last_response +5615 -0
- data/lib/jaleb.rb +68 -0
- data/lib/jaleb/data.rb +22 -0
- data/lib/jaleb/model.rb +21 -0
- data/lib/jaleb/models/amazon.rb +51 -0
- data/lib/jaleb/models/base.rb +26 -0
- data/lib/jaleb/models/ebay.rb +32 -0
- data/lib/jaleb/models/etsy.rb +31 -0
- data/lib/jaleb/models/gamecouk.rb +40 -0
- data/lib/jaleb/models/googleshopping.rb +31 -0
- data/lib/jaleb/models/newegg.rb +33 -0
- data/lib/jaleb/models/souq.rb +32 -0
- data/lib/jaleb/models/thinkgeek.rb +31 -0
- data/lib/jaleb/nokogiri.rb +53 -0
- data/lib/jaleb/string.rb +33 -0
- data/lib/jaleb/version.rb +3 -0
- metadata +113 -0
data/lib/jaleb.rb
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
# = Jaleb
|
2
|
+
# Author:: Hasan Basheer - 2017
|
3
|
+
require "uri"
|
4
|
+
require_relative "jaleb/version"
|
5
|
+
require_relative "jaleb/data"
|
6
|
+
require_relative "jaleb/model"
|
7
|
+
require_relative "jaleb/string"
|
8
|
+
require_relative "jaleb/nokogiri"
|
9
|
+
|
10
|
+
|
11
|
+
module Jaleb
  # Directory holding the library's own source files.
  LIBRARY_PATH = File.join(File.dirname(__FILE__), 'jaleb')
  # Directory holding one file per site-specific model.
  MODEL_PATH = File.join(LIBRARY_PATH, 'models')

  # Module-level API
  class << self
    # Fetch a page and parse it with the model matching its URL.
    #
    # url:: the page address as a String.
    #
    # Returns the parsed model instance with its +url+ attribute set.
    # Falls back to Jaleb::Model::Base when no specific model matches.
    def fetch(url)
      model_class = Jaleb::Model.identify(url) || Jaleb::Model::Base
      data = Jaleb::Data.read(url)

      model_class.new.tap do |product|
        product.parse(data)
        # Remember where the data came from
        product.url = url
      end
    end

    # Names of the available models as symbols, derived from the file names
    # in the models directory (the base model is excluded).
    def models
      model_files
        .map { |path| File.basename(path, ".rb").to_sym }
        .reject { |name| name == :base }
    end

    # Classes of the available models (the base model is excluded), e.g.
    #   Jaleb.model_classes # => [Amazon, Ebay, ThinkGeek]
    def model_classes
      model_files
        .map { |path| File.basename(path, ".rb") }
        .reject { |name| name == "base" }
        .map { |name| ["Jaleb", "Model", name.camelize].join("::").constantize }
    end

    private

    # Paths of every Ruby file in the models directory.
    def model_files
      Dir[File.join(File.dirname(__FILE__), "jaleb", "models", "*.rb")]
    end
  end

  # Register every model for lazy loading
  module Model
    autoload :Base, File.join(MODEL_PATH, "base")
    Jaleb.models.each do |model|
      autoload model.to_s.camelize.to_sym, File.join(MODEL_PATH, model.to_s)
    end
  end
end
|
data/lib/jaleb/data.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
require "open-uri"
|
3
|
+
|
4
|
+
|
5
|
+
module Jaleb
  # This class is responsible for fetching and parsing data
  class Data
    # Download +url+ and return it as a parsed Nokogiri HTML document.
    #
    # url:: address of the page as a String; it must parse to a URI that
    #       open-uri can open (http, https, ftp).
    #
    # The raw response body is also written to the +last_response+ file two
    # directories above this file, for debugging.
    #
    # Raises ArgumentError when the URL's scheme cannot be opened by open-uri.
    def self.read(url)
      opt = {}
      opt['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36'

      # Open through the URI object instead of Kernel#open: Kernel#open would
      # run a "|command" string as a subprocess and is no longer patched by
      # open-uri on current Rubies.
      uri = ::URI.parse(url)
      raise ArgumentError, "unsupported url: #{url}" unless uri.respond_to?(:open)

      # Read the body once; the block form closes the response afterwards.
      body = uri.open(opt) { |response| response.read }
      doc = ::Nokogiri::HTML(body)

      # Save contents of URL/Remote File for debugging; the block form closes
      # (and therefore flushes) the file handle.
      last_response_file = File.expand_path(File.join("..", "..", "last_response"), File.dirname(__FILE__))
      File.open(last_response_file, "w+") { |file| file.write(body) }

      doc
    end
  end
end
|
data/lib/jaleb/model.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
module Jaleb
  module Model
    class << self
      # Detect the model class responsible for a URL.
      #
      # url:: the address to inspect, as a String.
      #
      # Returns the last class in Jaleb.model_classes whose +regexp+ matches
      # the URL's host, or nil when none matches.
      #
      # Raises ArgumentError when +url+ does not match ::URI.regexp.
      def identify(url)
        raise ArgumentError, "not a url" unless url =~ ::URI.regexp

        host = ::URI.parse(url).host

        # Later matches override earlier ones, so the last matching class wins.
        match_model = nil
        Jaleb.model_classes.each do |model|
          match_model = model if host =~ model.regexp
        end
        match_model
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module Jaleb
  module Model
    # Model for product pages hosted on amazon.com.
    class Amazon < Jaleb::Model::Base
      # A regular expression for determining if a url comes from a specific service/website
      def self.regexp
        /amazon\.com/
      end

      # Parse data and look for object attributes to give to object.
      #
      # data:: the fetched page; attributes are only extracted when it is an
      #        HTML document.
      #
      # Sets name, description, price, images and image. Returns the first
      # image, or nil when +data+ is not an HTML document.
      def parse(data)
        super(data)

        case doc
        when Nokogiri::HTML::Document
          # Get Name, falling back to the first part of the page title
          self.name = doc.css("h1#title").first_string
          self.name = doc.xpath("string(//title)").split(" - ").first unless self.name

          # Get Description
          self.description = doc.css(".productDescriptionWrapper").first_string

          # Get description from the meta description tag if not found
          self.description = doc.xpath("//meta[@name='description']/@content").first_string if description.nil?

          # Get Price: try each known location until one yields a price
          parse_price(doc.css("#actualPriceValue").first_string)
          parse_price(doc.css("#priceblock_ourprice").first_string) unless self.price
          parse_price(doc.css("#priceblock_saleprice").first_string) unless self.price
          parse_price(doc.xpath("//span[contains(@id, 'price')]").first_string) unless self.price

          # Get Unqualified Price
          parse_price(doc.xpath("//*[contains(@id, 'unqualifiedBuyBox')]//span").first_string) unless self.price

          # Get Used Price
          parse_price(doc.xpath("//*[contains(@id, 'secondaryUsedAndNew')]//*[@class='price']").first_string) unless self.price

          # Get Images. attribute_array always returns an array, so every
          # fallback has to test for emptiness rather than for nil.
          self.images = doc.xpath("//*[@data-action='main-image-click']//img").attribute_array
          self.images = doc.xpath("//*[@id='imageBlock']//img").attribute_array if self.images.empty?

          # Get images for in-house products (kindle, etc.)
          self.images = doc.xpath("//*[@id='kib-ma-container-0']//img").attribute_array if self.images.empty?

          # Get images for third-party sellers
          self.images = doc.xpath("//*[@id='prodImageContainer']//img").attribute_array if self.images.empty?

          self.image = images.first
        end
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require "hashie"
|
2
|
+
require "money"
|
3
|
+
require "monetize"
|
4
|
+
|
5
|
+
module Jaleb
  module Model
    # Generic fallback model; site-specific models inherit from it.
    class Base < ::Hashie::Mash
      # Store the fetched document and derive default attributes from it.
      #
      # data:: the fetched document; kept under +doc+ so callers can reach
      #        the raw data later.
      def parse(data)
        self.doc = data

        # The page title serves as the default name
        self.name = doc.css("head > title").first_string
      end

      # Turn a raw price string into usable data and store it under +price+.
      # Does nothing when +raw_price+ is nil.
      #   model.parse_price("$5.00") # => #<Money cents:500 currency:USD>
      #   model.price.to_f # => 5.0
      #   model.price.currency.symbol # => '$'
      def parse_price(raw_price = nil)
        self.price = ::Monetize.parse(raw_price) unless raw_price.nil?
      end
    end # Base
  end # Model
end # Jaleb
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Jaleb
  module Model
    # Model for item pages hosted on ebay.com.
    class Ebay < Jaleb::Model::Base
      # A regular expression for determining if a url comes from a specific service/website
      def self.regexp
        /ebay\.com/
      end

      # Parse data and look for object attributes to give to object.
      #
      # data:: the fetched page; attributes are only extracted when it is an
      #        HTML document.
      #
      # Sets name, price, images and image. Returns the first image, or nil
      # when +data+ is not an HTML document.
      def parse(data)
        super(data)

        case doc
        when Nokogiri::HTML::Document
          # Get Name
          self.name = doc.xpath("//h1[@itemprop='name']").first_string

          # Get Description
          # OMITTED: This is tough to get because ebay item descriptions are custom html/content created by sellers

          # Get Price, dropping the "US" prefix before parsing
          raw_price = doc.xpath("//span[@itemprop='price']").first_string
          parse_price(raw_price.gsub(/US/, "")) if raw_price

          # Get Image. Select the src attribute itself: an <img> element has
          # no text content, so reading the element would never give a URL.
          self.images = [{:src => doc.xpath("//span[@itemprop='image']/img/@src").first_string}]
          self.image = images.first
        end
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Jaleb
  module Model
    # Model for listing pages hosted on etsy.com.
    class Etsy < Jaleb::Model::Base
      # Pattern used to recognise URLs that belong to this site
      def self.regexp
        /etsy\.com/
      end

      # Extract name, description, price and images from the fetched page.
      # Nothing beyond the base attributes is set unless the page is an
      # HTML document.
      def parse(data)
        super(data)
        return unless Nokogiri::HTML::Document === doc

        # Name
        self.name = doc.xpath("//span[@itemprop='name']").first_string

        # Description
        self.description = doc.xpath("//div[@id='description-text']").first_string

        # Price
        listing_price = doc.css("span#listing-price").first_string
        parse_price(listing_price)

        # Images
        main_images = doc.xpath("//div[@id='image-main']//img")
        self.images = main_images.attribute_array
        self.image = images.first
      end
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'date'

module Jaleb
  module Model
    # Model for product pages hosted on game.co.uk.
    class Gamecouk < Jaleb::Model::Base
      # A regular expression for determining if a url comes from a specific service/website
      def self.regexp
        /game\.co\.uk/
      end

      # Parse data and look for object attributes to give to object.
      #
      # data:: the fetched page; attributes are only extracted when it is an
      #        HTML document.
      #
      # Sets name, description, release_date, price, images and image.
      # Returns the first image, or nil when +data+ is not an HTML document.
      # Date.strptime raises when the release date text is not in
      # "dd-Mon-yyyy" form.
      def parse(data)
        super(data)

        case doc
        when Nokogiri::HTML::Document
          # Get Name
          self.name = doc.xpath("//meta[@property='og:title']/@content").first_string

          # Get Description (kept as the markup of the description panel)
          self.description = doc.css('div#primary div#details.panel div.description').to_s

          # Get Release Date, stripping the "Released on " label before parsing
          released = doc.css('div#primary div#details.panel p.releaseDate').first_string
          self.release_date = released && Date.strptime(released.gsub('Released on ',''), '%d-%b-%Y')

          # Get Price. GBP is passed as the fallback currency for this parse
          # only, so the process-wide Money.default_currency stays untouched
          # for the other models.
          raw_price = doc.css("span.price").first_string
          self.price = ::Monetize.parse(raw_price.gsub(/Only /, ""), "GBP") if raw_price

          # Get Image
          self.images = [{:src => doc.xpath("//meta[@property='og:image']/@content").first_string}]
          self.image = images.first
        end
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Jaleb
  module Model
    # Model for product pages hosted on google.com.
    class Googleshopping < Jaleb::Model::Base
      # Pattern used to recognise URLs that belong to this site
      def self.regexp
        /google\.com/
      end

      # Extract name, description, price and images from the fetched page.
      # Nothing beyond the base attributes is set unless the page is an
      # HTML document.
      def parse(data)
        super(data)
        return unless Nokogiri::HTML::Document === doc

        # Name: keep the one set by the base class when the page has no main title
        main_title = doc.css('h1#product-name span.main-title').first_string
        self.name = main_title if main_title

        # Description
        self.description = doc.css("#product-description-full").first_string

        # Price
        summary_price = doc.css('#summary-prices .price').first_string
        parse_price(summary_price)

        # Images
        info_images = doc.css('div#product-basic-info img')
        self.images = info_images.attribute_array
        self.image = images.first
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Jaleb
  module Model
    # Model for product pages hosted on newegg.com.
    class Newegg < Jaleb::Model::Base
      # Pattern used to recognise URLs that belong to this site
      def self.regexp
        /newegg\.com/
      end

      # Extract name, description and images from the fetched page.
      # Any StandardError raised along the way is swallowed and nil is
      # returned; attributes assigned before the failure stay in place.
      def parse(data)
        super(data)
        return unless Nokogiri::HTML::Document === doc

        # Name
        self.name = doc.css("#synopsis .grpDesc .wrapper h1 > span").first_string
        #self.name = doc.css("#synopsis .grpDesc .wrapper > .span").first_string

        # Description - not always reliable.
        self.description = doc.css(".itmDesc > p").first_string

        # Price (currently disabled because price is displayed after page load)
        #parse_price doc.css("li.price-current").attribute("content").content

        # Images: take the src of the main slide image
        main_src = doc.css('.mainSlide > img').attribute("src")
        self.images = [{ src: main_src.content }]
        self.image = images.first
      rescue StandardError
        nil
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Jaleb
  module Model
    # Model for product pages hosted on souq.com.
    class Souq < Jaleb::Model::Base
      # Pattern used to recognise URLs that belong to this site
      def self.regexp
        /souq\.com/
      end

      # Extract name, description, price and image from the fetched page.
      # Nothing beyond the base attributes is set unless the page is an
      # HTML document.
      def parse(data)
        super(data)
        return unless Nokogiri::HTML::Document === doc

        # Name and description come from the Open Graph meta tags
        self.name = doc.xpath("//meta[@property='og:title']/@content").first_string
        self.description = doc.xpath("//meta[@property='og:description']/@content").first_string

        # Price: drop the commas before parsing
        listed_price = doc.css('h3.price').first_string
        parse_price(listed_price.gsub(/,/, "")) if listed_price

        # Image comes from the Open Graph meta tag as well
        og_image = doc.xpath("//meta[@property='og:image']/@content").first_string
        self.images = [{ src: og_image }]
        self.image = images.first
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Jaleb
  module Model
    # Model for product pages hosted on thinkgeek.com.
    class Thinkgeek < Jaleb::Model::Base
      # Pattern used to recognise URLs that belong to this site
      def self.regexp
        /thinkgeek\.com/
      end

      # Extract name, description, price and image from the fetched page.
      # Nothing beyond the base attributes is set unless the page is an
      # HTML document.
      def parse(data)
        super(data)
        return unless Nokogiri::HTML::Document === doc

        # Name and description come from the Open Graph meta tags
        self.name = doc.xpath("//meta[@property='og:title']/@content").first_string
        self.description = doc.xpath("//meta[@property='og:description']/@content").first_string

        # Price: best effort, any StandardError while reading it is ignored
        begin
          parse_price(doc.xpath("//form[@id='buy']/h3").first_string)
        rescue StandardError
          nil
        end

        # Image comes from the Open Graph meta tag as well
        og_image = doc.xpath("//meta[@property='og:image']/@content").first_string
        self.images = [{ src: og_image }]
        self.image = images.first
      end
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require "cgi"
|
2
|
+
|
3
|
+
module Jaleb
  # Helper methods for Nokogiri interactions
  module Nokogiri
    module HTML
      # No helpers are defined for documents
      module Document
      end # Document
    end # HTML

    module XML
      module NodeSet
        # Sanitized text of the first node in the set: the content of an
        # element or the value of an attribute. Returns nil for an empty set
        # or any other kind of node.
        def first_string
          node = first
          if ::Nokogiri::XML::Element === node
            node.content.sanitize
          elsif ::Nokogiri::XML::Attr === node
            node.value.sanitize
          end
        end

        # Convert the nodes of the set into an array of hashes, one per node.
        # Elements yield their tag name under :element plus their sanitized
        # attribute values; any other node yields an empty hash.
        #   @doc.xpath("//img").attribute_array # => [{:element => "img", :src => ".../someimage.png"}]
        def attribute_array
          map do |node|
            attrs = {}
            if ::Nokogiri::XML::Element === node
              attrs[:element] = node.name
              node.attributes.each do |key, value|
                attrs[key.to_sym] = value.value.sanitize if ::Nokogiri::XML::Attr === value
              end
            end
            attrs
          end
        end
      end # NodeSet
    end # XML
  end # Nokogiri
end # Jaleb
|
51
|
+
|
52
|
+
::Nokogiri::HTML::Document.send(:include, ::Jaleb::Nokogiri::HTML::Document)
|
53
|
+
::Nokogiri::XML::NodeSet.send(:include, ::Jaleb::Nokogiri::XML::NodeSet)
|