RubyGems - stylemooncat - Versions diffs - 0.0.1 - Mend

stylemooncat 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

checksums.yaml +7 -0
data/bin/stylemooncat +6 -0
data/lib/stylemooncat.rb +2 -0
data/lib/stylemooncat/scraper.rb +156 -0
metadata +55 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 8c7e37b6424726cdec5907fa1fed3985228a2eca
+  data.tar.gz: 2c42777971df54b19607c98cb352966f8c8313ce
+SHA512:
+  metadata.gz: 1cae3ab2235f3022a2d54712b0591191dbb467aadcfb85ff80ad2e3505c5b4eaf839a907e32f803a5926417614e54869978fbaa36a6e0582be51ada006a0831a
+  data.tar.gz: db75b57f6f91455db0cf1c5ff3df22b0d86b495c882c8f0b2bef3c7ca5c4765389a9c1d947950bc38117d438bc958b8ded05a2e2f0d797605c3c4faa6c69554d

data/bin/stylemooncat ADDED Viewed

@@ -0,0 +1,6 @@
+#!/usr/bin/env ruby
+require_relative '../lib/stylemooncat.rb'
+@scraper = StyleMoonCat::Scraper.new
+puts @scraper.get_top(ARGV[0])

data/lib/stylemooncat.rb ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ #!/usr/bin/env ruby
2	+ require_relative 'stylemooncat/scraper'

data/lib/stylemooncat/scraper.rb ADDED Viewed

@@ -0,0 +1,156 @@
+#!/usr/bin/env ruby
+require 'oga'
+require 'open-uri'
+require 'open-uri-s3'
+# scrape data
+module StyleMoonCat
+  class Scraper
+  # URI
+    @@BASE_URI = 'http://www.stylemooncat.com.tw'
+    @@NEW_ARRIVALS_URI = "#{@@BASE_URI}/PDList.asp?recommand=1312090001"
+    @@LAST_WEEK_URI = "#{@@BASE_URI}/PDList.asp?recommand=1312090002"
+    @@SPECIAL_DISCOUNT_URI = "#{@@BASE_URI}/PDList.asp?recommand=1312090003"
+    @@TOP_URI  = "#{@@BASE_URI}/PDList.asp?p1=01"
+    @@BOTTOM_URI = "#{@@BASE_URI}/PDList.asp?p1=02"
+    @@OUTER_URI = "#{@@BASE_URI}/PDList.asp?p1=03"
+    @@DRESS_URI = "#{@@BASE_URI}/PDList.asp?p1=04"
+    @@SHOES_URI = "#{@@BASE_URI}/PDList.asp?p1=05&p2=01"
+    @@BAG_URI = "#{@@BASE_URI}/PDList.asp?p1=05&p2=02"
+    @@ACCESSORIES_URI = "#{@@BASE_URI}/PDList.asp?p1=06"
+  # Selectors
+    @@ITEM_XPATH      = "//div[contains(@class, 'goodsBox')]/div[contains(@class, 'goodl')]"
+    @@LINK_XPATH      = 'a'
+    @@IMAGE_XPATH    = "a/img"
+    @@TITLE_XPATH     = "div[contains(@class, 'pd_info_l')]"    # /div[contains(@class, 'pd_info_l')]   is wrong
+    @@PRICE_SPAN_XPATH    = "div[contains(@class, 'pd_info_l')]/span"
+    @@PRICE_STRIKE_XPATH    = "div[contains(@class, 'pd_info_l')]/strike"
+    # Regular ?
+    @@TITLE_REGEX = /([．\p{Han}[a-zA-Z]]+)/
+    def get_new_arrival(page)
+      uri  = uri_with_page(@@NEW_ARRIVALS_URI, page)
+      body = fetch_data(uri)
+      filter(body)
+    end
+    def get_last_week(page)
+      uri  = uri_with_page(@@LAST_WEEK_URI, page)
+      body = fetch_data(uri)
+      filter(body)
+    end
+    def get_special_discount(page)
+      uri  = uri_with_page(@@SPECIAL_DISCOUNT_URI, page)
+      body = fetch_data(uri)
+      filter(body)
+    end
+    def get_top(page)
+      uri  = uri_with_page(@@TOP_URI, page)
+      body = fetch_data(uri)
+      filter(body)
+    end
+    def get_bottom(page)
+      uri  = uri_with_page(@@BOTTOM_URI, page)
+      body = fetch_data(uri)
+      filter(body)
+    end
+    def get_outer(page)
+      uri  = uri_with_page(@@OUTER_URI, page)
+      body = fetch_data(uri)
+      filter(body)
+    end
+    def get_dress(page)
+      uri  = uri_with_page(@@DRESS_URI, page)
+      body = fetch_data(uri)
+      filter(body)
+    end
+    def get_shoes(page)
+      uri  = uri_with_page(@@SHOES_URI, page)
+      body = fetch_data(uri)
+      filter(body)
+    end
+    def get_bag(page)
+      uri  = uri_with_page(@@BAG_URI, page)
+      body = fetch_data(uri)
+      filter(body)
+    end
+    def get_accessories(page)
+      uri  = uri_with_page(@@ACCESSORIES_URI, page)
+      body = fetch_data(uri)
+      filter(body)
+    end
+    private
+    def uri_with_page(uri, page)
+      "#{uri}&pageno=#{page}"
+    end
+    def fetch_data(uri)
+      puts uri
+      open(uri) {|file| file.read}
+    end
+    def filter(raw)
+    #  puts Oga.parse_html(raw).xpath(@@ITEM_XPATH).map { |item| parse(item) }
+      Oga.parse_html(raw)
+         .xpath(@@ITEM_XPATH)
+         .map { |item| parse(item) }
+    end
+    def parse(item)
+      {
+        title:  extract_title(item),
+        price:  extract_price(item),
+        images: extract_images(item),
+        link:   extract_link(item)
+      }
+    end
+    def extract_title(item)
+        item.xpath(@@TITLE_XPATH).text.split("TWD")[0]
+    end
+    def extract_price(item)
+      # if there is discount, priceString format is "originPirce sellingPrice"
+      # .split(' ') is fail. so use this method to extract sellingPrice
+      priceString = item.xpath(@@TITLE_XPATH).text.split("TWD.")[1]
+      length = priceString.length
+      if length ==8 || length ==9  #ex: priceString ==  "1200 990"   or "1200 1100"
+          space = priceString[4]
+          result = priceString.split(space)[1]
+      elsif length ==7 || length ==6 #ex: priceString == "999 990"  or   "120 99"
+          space = priceString[3]
+          result = priceString.split(space)[1]
+      elsif length ==5 #ex: priceString == "99 90"
+            space = priceString[2]
+            result = priceString.split(space)[1]
+      else #no discount
+            result = priceString
+      end
+      puts result
+      result
+    end
+    def extract_images(item)
+      item.xpath(@@IMAGE_XPATH).attribute(:src).first.value
+    end
+    def extract_link(item)
+      "#{@@BASE_URI}/#{item.xpath(@@LINK_XPATH).attribute(:href).first.value}"
+    end
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,55 @@
+--- !ruby/object:Gem::Specification
+name: stylemooncat
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Even Chang
+- Luis Herrera
+- Katy Lee
+- Frank Lee
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2015-12-19 00:00:00.000000000 Z
+dependencies: []
+description: This is a gem scraping StyleMoonCat's website and returns certain category's
+  items with title,price,image,and link
+email:
+- kiki44552002@gmail.com
+- lmherrera86@gmail.com
+- katylee41024@yahoo.com.tw
+- frank1234211@gmail.com
+executables:
+- stylemooncat
+extensions: []
+extra_rdoc_files: []
+files:
+- bin/stylemooncat
+- lib/stylemooncat.rb
+- lib/stylemooncat/scraper.rb
+homepage: http://rubygems.org/gems/stylemooncat
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.6
+signing_key:
+specification_version: 4
+summary: Scraper for StyleMoonCat
+test_files: []