RubyGems - joyceshop - Versions diffs - 0.0.3 → 0.0.4 - Mend

joyceshop 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 2c4b143619c3fe55c48d39a9b419f39d6923fce0
-  data.tar.gz: 1b0fd41b0f9aff1067b680b43e2260a3f7f83cae
+  metadata.gz: 05a440353df86f1a449acbe8633aefd1139a0c55
+  data.tar.gz: 9cedcb43aca538bb30e5e1de8bf0a17d472e795e
 SHA512:
-  metadata.gz: 1b7810cbdc56d47096794d986417082dbe14dc84122e8d0539dbbc2e1f0e4c243f98ba451be45642fafb93fc0f053c0c258e8e4c6fcc6b288caded95c1b961c7
-  data.tar.gz: bd71e91f8adda7003cd73357e1eb2495c4e262d30e39169242bca461f4baaf0caffc5925849c0a5526f3a9fc3c438e2c05b3df9eb72170b1036e40ef063aa02f
+  metadata.gz: e2a8e4481761300cc527fc1103c377a54a1d547ee5d92a3a5fb14ebd60438e10427d8458625ec8484da37f4cb6a5253b464c124ff6a13c7859e67e8de472abfb
+  data.tar.gz: 986e94aa57f5ebe8c9470491079c0452c0e06aaa2886d3e767c0191ac3e5c51f07d6b19b462c48b7b20c1a8a5717a4f8e87b8a57ae0fe037a345f23bdf0d9ae2

data/bin/joyceshop CHANGED

@@ -2,6 +2,34 @@
 # require 'joyceshop' # for production
 require_relative '../lib/joyceshop.rb' # for testing
-scraper = JoyceShop::Scraper.new
-puts scraper.search('紗針織衫', {price_boundary: [100, 443]})
-puts scraper.scrape(:tops, 1)
+@scraper = JoyceShop::Scraper.new
+# command type keyword lprice hprice page_limit
+def parse_args argv
+  input_length = argv.length
+  abort 'invalid usage' unless input_length <= 5
+  if input_length == 0  # scrape main category
+    @scraper.scrape('latest')
+  elsif input_length == 1  # scrape main category
+    @scraper.scrape(argv[0])
+  elsif input_length == 2
+    t = argv[1].to_i
+    if t != 0
+      options = { page_limit: argv[1] }
+    else
+      options = { keyword: argv[1] }
+    end
+    @scraper.scrape(argv[0], options)
+  elsif input_length == 3
+    options = { keyword: argv[1], page_limit: argv[2] }
+    @scraper.scrape(argv[0], options)
+  elsif input_length == 5
+    options = { keyword: argv[2], page_limit: argv[5],
+      price_boundary: [argv[3], argv[4]]
+    }
+    @scraper.scrape_filter(argv[0], options)
+  end
+end
+puts parse_args ARGV

data/lib/joyceshop/scraper.rb CHANGED

@@ -5,104 +5,84 @@ require 'open-uri'
 # scrape data
 module JoyceShop
+  # extract_data class uses xpath selectors to get attribs
   class Scraper
-    # Types
-    @@VALID_TYPES = [:tops, :popular, :pants, :pants, :accessories, :latest]
-    # URI
-    @@BASE_URI        = 'https://www.joyce-shop.com'
-    @@LATEST_URI      = "#{@@BASE_URI}/PDList.asp?brand=01&item1=&item2=&ya19=&keyword=&recommand=1412170001&ob=F"
-    @@POPULAR_URI     = "#{@@BASE_URI}/PDList.asp?brand=01&item1=&item2=&ya19=&keyword=&recommand=1305080002&ob=F"
-    @@TOPS_URI        = "#{@@BASE_URI}/PDList.asp?brand=01&item1=110&item2=111&ya19=&keyword=&recommand=&ob=F"
-    @@PANTS_URI       = "#{@@BASE_URI}/PDList.asp?brand=01&item1=120&item2=121&ya19=&keyword=&recommand=&ob=F"
-    @@ACCESSORIES_URI = "#{@@BASE_URI}/PDList.asp?brand=01&item1=140&item2=141&ya19=&keyword=&recommand=&ob=F"
-    @@SEARCH_URI      = "#{@@BASE_URI}/PDList.asp?"
-    # Selectors
-    @@ITEM_SELECTOR      = "//div[contains(@class, 'NEW_shop_list')]/ul/li/div[contains(@class, 'NEW_shop_list_pic')]"
-    @@LINK_SELECTOR      = 'a[1]/@href'
-    @@IMAGE_SELECTOR     = "a/img[contains(@class, 'lazyload')]/@src"
-    @@ITEM_INFO_SELECTOR = "div[contains(@class, 'NEW_shop_list_info')]"
-    @@TITLE_SELECTOR     = "#{@@ITEM_INFO_SELECTOR}/div[1]"
-    @@PRICE_SELECTOR     = "#{@@ITEM_INFO_SELECTOR}/span"
+    BASE_URL        = 'https://www.joyce-shop.com'
+    BASE_SCRAPE_URL = "#{BASE_URL}/PDList.asp?"
+    LATEST_URI      = "#{BASE_SCRAPE_URL}brand=01&item1=&item2=&ya19=&keyword=&recommand=1412170001&ob=F"
+    POPULAR_URI     = "#{BASE_SCRAPE_URL}brand=01&item1=&item2=&ya19=&keyword=&recommand=1305080002&ob=F"
+    TOPS_URI        = "#{BASE_SCRAPE_URL}brand=01&item1=110&item2=111&ya19=&keyword=&recommand=&ob=F"
+    PANTS_URI       = "#{BASE_SCRAPE_URL}brand=01&item1=120&item2=121&ya19=&keyword=&recommand=&ob=F"
+    ACCESSORIES_URI = "#{BASE_SCRAPE_URL}brand=01&item1=140&item2=141&ya19=&keyword=&recommand=&ob=F"
+    # xml selectors that will be used to scrape data
+    ITEM_SELECTOR   = "//div[contains(@class, 'NEW_shop_list')]/ul/li/div[contains(@class, 'NEW_shop_list_pic')]"
+    ITEM_INFO_SELECTOR = "div[contains(@class, 'NEW_shop_list_info')]"
+    TITLE_SELECTOR  = "#{ITEM_INFO_SELECTOR}/div[1]"
+    IMAGE_SELECTOR  = "a/img[contains(@class, 'lazyload')]/@src"
+    PRICE_SELECTOR  = "#{ITEM_INFO_SELECTOR}/span"
+    LINK_SELECTOR   = "a[1]/@href"
     # Regular
-    @@TITLE_REGEX = /([．\p{Han}[a-zA-Z]]+)/
+    TITLE_REGEX = /([．\p{Han}[a-zA-Z]]+)/
-    def latest(page, options={})
-      uri  = uri_with_page(@@LATEST_URI, page)
-      body = fetch_data(uri)
-      data = parse_html(body)
-      filter(data, options)
+    def latest(page, options = {})
+      uri  = uri_with_options(build_uri(LATEST_URI, options), page)
+      process_request(uri, options)
     end
-    def popular(page, options={})
-      uri  = uri_with_page(@@POPULAR_URI, page)
-      body = fetch_data(uri)
-      data = parse_html(body)
-      filter(data, options)
+    def popular(page, options = {})
+      uri  = uri_with_options(build_uri(POPULAR_URI, options), page)
+      process_request(uri, options)
     end
-    def tops(page, options={})
-      uri  = uri_with_page(@@TOPS_URI, page)
-      body = fetch_data(uri)
-      data = parse_html(body)
-      filter(data, options)
+    def tops(page, options = {})
+      uri  = uri_with_options(build_uri(TOPS_URI, options), page)
+      process_request(uri, options)
     end
-    def pants(page, options={})
-      uri  = uri_with_page(@@PANTS_URI, page)
-      body = fetch_data(uri)
-      data = parse_html(body)
-      filter(data, options)
+    def pants(page, options = {})
+      uri  = uri_with_options(build_uri(PANTS_URI, options), page)
+      process_request(uri, options)
     end
-    def accessories(page, options={})
-      uri  = uri_with_page(@@ACCESSORIES_URI, page)
-      body = fetch_data(uri)
-      data = parse_html(body)
-      filter(data, options)
+    def accessories(page, options = {})
+      uri  = uri_with_options(build_uri(ACCESSORIES_URI, options), page)
+      process_request(uri, options)
     end
-    def search(keyword, options={})
-      uri  = uri_with_search(keyword)
-      body = fetch_data(uri)
-      data = parse_html(body)
-      filter(data, options)
+    def search(page, options = {})
+      uri  = uri_with_options(build_uri(BASE_SCRAPE_URL, options), page)
+      process_request(uri, options)
     end
-    def scrape(type, page, options = {})
-      abort "only supports #{@@VALID_TYPES}" unless @@VALID_TYPES.include?(type.to_sym)
-      method = self.method(type)
-      method.call(page, options)
+    def scrape(type, options = {})
+      records = []
+      valid_args = [:tops, :popular, :pants, :pants,
+        :accessories, :latest, :search]
+      abort 'invalid parameter - scrape type' unless valid_args.include?(type.to_sym)
+      scrape_what(type, options)
     end
     private
-    def uri_with_page(uri, page)
-      "#{uri}&pageno=#{page}"
-    end
-    def uri_with_search(keyword)
-      "#{@@SEARCH_URI}keyword=#{URI.escape(keyword)}"
-    end
-    def fetch_data(uri)
-      open(uri) { |file| file.read }
+    def process_request(uri, options)
+      body = open_uri(uri)
+      data = extract_data(body)
+      filter(data, options)
     end
-    # Filter
-    # ------------------------------------------------------------
+    # filter by price if the options are not empty
     def filter(data, options)
       results = data
       unless options.empty?
         results = match_price(results, options[:price_boundary]) if options[:price_boundary]
       end
       results
     end
+    # do the actual extraction of prices from the result set
     def match_price(data, boundary)
       lower_bound = boundary.first || 0
       upper_bound = boundary.last  || Float::INFINITY
@@ -110,14 +90,39 @@ module JoyceShop
       data.select { |item| lower_bound <= item[:price] && item[:price] <= upper_bound }
     end
-    # Parser
-    # ------------------------------------------------------------
-    def parse_html(raw)
+    def build_uri(uri, options = {})
+      opts = { uri: uri }
+      unless options.empty?
+        opts[:keyword] = options[:keyword] if options[:keyword]
+      end
+      opts
+    end
+    def uri_with_options(options = {}, page)
+      uri = ''
+      unless options.empty?
+        keyword = options[:keyword] || nil
+        uri << "#{options[:uri]}&pageno=#{page}" if options[:uri]
+        uri << "br=X&keyword=#{URI.escape(keyword)}" if options[:keyword]
+      end
+      uri
+    end
+    # try open the URL, fail on error
+    def open_uri(uri)
+      open(uri) {|file| file.read}
+    rescue StandardError
+      'error opening site url'
+    end
+    # iterate over every element of item using xpath
+    def extract_data(raw)
       Oga.parse_html(raw)
-         .xpath(@@ITEM_SELECTOR)
+         .xpath(ITEM_SELECTOR)
          .map { |item| parse(item) }
     end
+    # call methods to extract the data using xpath
     def parse(item)
       {
         title:  extract_title(item),
@@ -127,24 +132,41 @@ module JoyceShop
       }
     end
+    # Iconv is neccessary here otherwise text is unreadable
     def extract_title(item)
-      item.xpath(@@TITLE_SELECTOR).text
-          .scan(@@TITLE_REGEX)
+      item.xpath(TITLE_SELECTOR).text
+          .scan(TITLE_REGEX)
           .flatten[0]
     end
+    # get rid of the NT and convert to integer
     def extract_price(item)
-      item.xpath(@@PRICE_SELECTOR).text.to_i
+      item.xpath(PRICE_SELECTOR).text.sub(/NT. /, '').to_i
     end
+    # extract two images and return array or urls
     def extract_images(item)
-      image       = item.xpath(@@IMAGE_SELECTOR).text
+      image       = item.xpath(IMAGE_SELECTOR).text
       image_hover = image.sub(/\.jpg/, '-h.jpg')
-      ["#{@@BASE_URI}#{image}", "#{@@BASE_URI}#{image_hover}"]
+      image_hover = image.sub(/\.png/, '-h.png') unless image_hover != image
+      ["#{BASE_URL}#{image}", "#{BASE_URL}#{image_hover}"]
     end
+    # get the link to the item
     def extract_link(item)
-      "#{@@BASE_URI}/#{item.xpath(@@LINK_SELECTOR).text}"
+      "#{BASE_URL}/#{item.xpath(LINK_SELECTOR).text}"
+    end
+    def scrape_what(type, options)
+      records = []
+      pl = options[:page_limit].to_i
+      page_limit = pl != 0 ? pl : 5
+      1.upto(page_limit) do |page|
+        method = self.method(type)
+        records.push(method.call(page, options))
+      end
+      records.reject { |c| c.empty? }.flatten(1).uniq
     end
   end
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: joyceshop
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4
 platform: ruby
 authors:
 - Even Chang
@@ -11,7 +11,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-01-04 00:00:00.000000000 Z
+date: 2016-01-10 00:00:00.000000000 Z
 dependencies: []
 description: This is a gem scraping joyceshop's website and returns the popular/latest
   items