queenshop 0.0.8 → 0.1.0
- checksums.yaml +4 -4
- data/bin/queenshop +31 -2
- data/lib/queenshop.rb +0 -1
- data/lib/queenshop/scraper.rb +148 -59
- metadata +2 -3
- data/lib/queenshop/config.rb +0 -58
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 92d0b55a68d535fb78b45df74784fa6a0dc4ea76
+  data.tar.gz: a6fdda4008a9620f139f48e33739d0240a0d72c8
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 739665cda25b1a572ce0d788e2402cd2ef2bf3ce577a883c13403716006cf93517294f80de81a43a78b6101b28174b2514521f5c56593671b751671def7de2ea
+  data.tar.gz: 50128c72758a26f8b860862985cb2466556aa4b13d39be71b0e2184001cf2b6a08efd134f020988b7cc6504039238a3a3f1730013e20f0bd8b5db5f33375be01
data/bin/queenshop
CHANGED
@@ -2,5 +2,34 @@
 # require 'queenshop' # for production
 require_relative '../lib/queenshop.rb' # for testing

-scraper =
-
+@scraper = QueenShop::Scraper.new
+
+# command type keyword lprice hprice page_limit
+def parse_args argv
+  input_length = argv.length
+  abort 'invalid usage' unless input_length <= 5
+
+  if input_length == 0 # scrape main category
+    @scraper.scrape('latest')
+  elsif input_length == 1 # scrape main category
+    @scraper.scrape(argv[0])
+  elsif input_length == 2
+    t = argv[1].to_i
+    if t != 0
+      options = { page_limit: argv[1] }
+    else
+      options = { keyword: argv[1] }
+    end
+    @scraper.scrape(argv[0], options)
+  elsif input_length == 3
+    options = { keyword: argv[1], page_limit: argv[2] }
+    @scraper.scrape(argv[0], options)
+  elsif input_length == 5
+    options = { keyword: argv[2], page_limit: argv[5],
+                price_boundary: [argv[3], argv[4]]
+              }
+    @scraper.scrape_filter(argv[0], options)
+  end
+end
+
+puts parse_args ARGV
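For reference, a rough sketch of how the argument handling above maps command-line input onto the scraper. The invocations and values are illustrative only (not taken from the gem's documentation) and simply follow parse_args as written:

# queenshop                  -> @scraper.scrape('latest')
# queenshop popular          -> @scraper.scrape('popular')
# queenshop search blouse    -> @scraper.scrape('search', keyword: 'blouse')
# queenshop search blouse 3  -> @scraper.scrape('search', keyword: 'blouse', page_limit: '3')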
data/lib/queenshop.rb
CHANGED
data/lib/queenshop/scraper.rb
CHANGED
@@ -2,82 +2,171 @@
 require 'oga'
 require 'iconv'
 require 'open-uri'
-require_relative './config'

 # scrape data
-module
-  #
-  class
-
-
-
-
-
+module QueenShop
+  # extract_data class uses xpath selectors to get attribs
+  class Scraper
+    BASE_URL = 'https://queenshop.com.tw'
+    BASE_SCRAPE_URL = "#{BASE_URL}/m/PDList2.asp?"
+
+    LATEST_URI = "#{BASE_SCRAPE_URL}item1=new"
+    DISCOUNT_URI = "#{BASE_SCRAPE_URL}item1=dis"
+    POPULAR_URI = "#{BASE_SCRAPE_URL}item1=pre"
+    TOPS_URI = "#{BASE_SCRAPE_URL}brand=01&item1=00&item2=6"
+    PANTS_URI = "#{BASE_SCRAPE_URL}brand=01&item1=01&item2=3"
+    ACCESSORIES_URI = "#{BASE_SCRAPE_URL}brand=01&item1=02&item2=2"
+
+    # xml selectors that will be used to scrape data
+    ITEM_SELECTOR = "//div[@class='pditem']/div[@class='pdicon']"
+    TITLE_SELECTOR = "div[@class='pdicon_name']/a"
+    IMAGE_SELECTOR = "div[@class='pdicon_img']/a/img/@src"
+    PRICE_SELECTOR = "div[@class='pdicon_price']/div[@style='font-weight:bold;']"
+    LINK_SELECTOR = "div[@class='pdicon_name']/a/@href"
+    PAGES_SELECTOR = "div[@class='divPageClone']/a/@href"
+
+    def latest(page, options = {})
+      uri = uri_with_options(build_uri(LATEST_URI, options), page)
+      process_request(uri, options)
+    end
+
+    def popular(page, options = {})
+      uri = uri_with_options(build_uri(LATEST_URI, options), page)
+      process_request(uri, options)
+    end
+
+    def tops(page, options = {})
+      uri = uri_with_options(build_uri(LATEST_URI, options), page)
+      process_request(uri, options)
+    end
+
+    def pants(page, options = {})
+      uri = uri_with_options(build_uri(LATEST_URI, options), page)
+      process_request(uri, options)
+    end
+
+    def accessories(page, options = {})
+      uri = uri_with_options(build_uri(LATEST_URI, options), page)
+      process_request(uri, options)
+    end
+
+    def search(page, options = {})
+      uri = uri_with_options(build_uri(BASE_SCRAPE_URL, options), page)
+      process_request(uri, options)
+    end
+
+    def scrape(type, options = {})
+      records = []
+      valid_args = [:tops, :popular, :pants, :pants,
+                    :accessories, :latest, :search]
+      abort 'invalid parameter - scrape type' unless valid_args.include?(type.to_sym)
+      scrape_what(type, options)
+    end

     private

-    def
-
-
-      data
-
-
-
-
-
-
-
-      # try to open the url
-      document = get_xmldata(url)
-      # hard return on an error
-      return [] unless document != 'error'
-
-      items = document.xpath(@item_selector)
-      # loop through the items and get the title and price
-      items.map do |item|
-        title = item.xpath(@title_selector).text()
-        price = item.xpath(@price_selector).text
-        strip_filter(title, price) if title.downcase.include? @item_filter
+    def process_request(uri, options)
+      body = open_uri(uri)
+      data = extract_data(body)
+      filter(data, options)
+    end
+
+    # filter by price if the options are not empty
+    def filter(data, options)
+      results = data
+      unless options.empty?
+        results = match_price(results, options[:price_boundary]) if options[:price_boundary]
       end
-
+      results
     end

-
-
-
-
-
-
-
-
+    # do the actual extraction of prices from the result set
+    def match_price(data, boundary)
+      lower_bound = boundary.first || 0
+      upper_bound = boundary.last || Float::INFINITY
+
+      data.select { |item| lower_bound <= item[:price] && item[:price] <= upper_bound }
+    end
+
+    def build_uri(uri, options = {})
+      opts = { uri: uri }
+      unless options.empty?
+        opts[:keyword] = options[:keyword] if options[:keyword]
       end
+      opts
+    end

+    def uri_with_options(options = {}, page)
+      uri = ''
+      unless options.empty?
+        kw = options[:keyword] || nil
+        ic = Iconv.new('big5','UTF-8')
+        keyword = ic.iconv(kw)
+        uri << "#{options[:uri]}&pageno=#{page}" if options[:uri]
+        uri << "br=X&keyword=#{URI.escape(keyword)}" if options[:keyword]
+      end
+      uri
     end

-
+    # try open the URL, fail on error
+    def open_uri(uri)
+      open(uri) {|file| file.read}
+    rescue StandardError
+      'error opening site url'
+    end

-
-
-
-
-
-      @price_selector = "div[@class=\'pdicon_price\']/div[@style=\'font-weight:bold;\']"
-      @site_url = 'https://www.queenshop.com.tw/m/PDList2.asp?'
-      @price_filter = nil
+    # iterate over every element of item using xpath
+    def extract_data(raw)
+      Oga.parse_html(raw)
+         .xpath(ITEM_SELECTOR)
+         .map { |item| parse(item) }
     end

-
-
-
-
-
+    # call methods to extract the data using xpath
+    def parse(item)
+      {
+        title: extract_title(item),
+        price: extract_price(item),
+        images: extract_images(item),
+        link: extract_link(item)
+      }
+    end

-
-
-
-
-
+    # Iconv is neccessary here otherwise text is unreadable
+    def extract_title(item)
+      ic = Iconv.new('UTF-8','big5')
+      raw_title = item.xpath(TITLE_SELECTOR).text
+      ic.iconv(raw_title)
     end

+    # get rid of the NT and convert to integer
+    def extract_price(item)
+      item.xpath(PRICE_SELECTOR).text.sub(/NT. /, '').to_i
+    end
+
+    # extract two images and return array or urls
+    def extract_images(item)
+      image = item.xpath(IMAGE_SELECTOR).text
+      image_hover = image.sub(/\.jpg/, '-h.jpg')
+      image_hover = image.sub(/\.png/, '-h.png') unless image_hover != image
+      ["#{BASE_URL}#{image}", "#{BASE_URL}#{image_hover}"]
+    end
+
+    # get the link to the item
+    def extract_link(item)
+      "#{BASE_URL}/#{item.xpath(LINK_SELECTOR).text}"
+    end
+
+    def scrape_what(type, options)
+      records = []
+      pl = options[:page_limit].to_i
+      page_limit = pl != 0 ? pl : 5
+
+      1.upto(page_limit) do |page|
+        method = self.method(type)
+        records.push(method.call(page, options))
+      end
+      records
+    end
   end
 end
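A minimal usage sketch of the reworked Scraper when called as a library, assuming the installed gem can be loaded with require 'queenshop' (as the commented-out require in bin/queenshop suggests). The option keys and the item hash keys (:title, :price, :link) come from the code above; the keyword and page count are illustrative:

require 'queenshop'

scraper = QueenShop::Scraper.new

# scrape_what above loops from page 1 to the page limit, so this returns one array of item hashes per page
pages = scraper.scrape('search', keyword: 'dress', page_limit: 2)

pages.flatten.each do |item|
  puts "#{item[:title]} NT$#{item[:price]} #{item[:link]}"
end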
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: queenshop
 version: !ruby/object:Gem::Version
-  version: 0.0.8
+  version: 0.1.0
 platform: ruby
 authors:
 - Even Chang
@@ -11,7 +11,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2016-01-02 00:00:00.000000000 Z
 dependencies: []
 description: This is a gem scraping queenshop's website and returns the items with
   corresponding prices
@@ -27,7 +27,6 @@ extra_rdoc_files: []
 files:
 - bin/queenshop
 - lib/queenshop.rb
-- lib/queenshop/config.rb
 - lib/queenshop/scraper.rb
 homepage: http://rubygems.org/gems/queenshop
 licenses:
data/lib/queenshop/config.rb
DELETED
@@ -1,58 +0,0 @@
-#!/usr/bin/env ruby
-
-# this class takes care of
-# parsing the parameters
-module Validate
-  attr_reader :parameters
-  attr_reader :pages
-
-  VALID_ARGS = [:item, :price, :pages]
-
-  def validate_args(args)
-    @parameters = {item: '', price: '', pages: '1..7'}
-    args.each do |arg|
-      begin
-        match = /(?<key>.*?)=(?<value>.*)/.match(arg)
-        fail unless VALID_ARGS.include?(match[:key].to_sym)
-        value = check(match)
-        @parameters[match[:key].to_sym] = value
-      rescue StandardError
-        abort "invalid usage...\n" << usage << "\n\n"
-      end
-    end
-  end # end validate_args
-
-  def check(match)
-    value = match[:value]
-    fail unless value =~ /^(>|<|>=|<=|==)\d*.\d*?$/ if match[:key].to_sym.eql?(:price)
-    # Float(value) if match[:key].to_sym.eql?(:price)
-    fail unless value =~ /^\d*([.]{2}\d*)?$/ if match[:key].to_sym.eql?(:pages)
-    value
-  rescue StandardError
-    abort "invalid parameters"
-  end
-
-  def pages
-    first_page = @parameters[:pages].scan(/\d+/).first.to_i
-    last_page = @parameters[:pages].scan(/\d+/).last.to_i
-    @pages = *(first_page..last_page)
-  end
-
-  def usage
-    'Usage: queenshop [options]
-    item=(string)
-    price=(float[,float])
-    examples:
-    queenshop item="blouse" price=300
-    queenshop price=0,100
-    queenshop item="skirt"'
-  end
-end
-
-class QConfig
-  include Validate
-  def initialize (args)
-    validate_args (args)
-    pages
-  end
-end