queenshop 0.0.8 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d9d7d060414ffe7379cdc099d6a036208b41ce11
4
- data.tar.gz: 51d806387e05f7247e5b364ec824a85daee6e60d
3
+ metadata.gz: 92d0b55a68d535fb78b45df74784fa6a0dc4ea76
4
+ data.tar.gz: a6fdda4008a9620f139f48e33739d0240a0d72c8
5
5
  SHA512:
6
- metadata.gz: e466143cb2b9499c8a9655c19519294754be940c149e702218a1cd7960ae26164cf69eff0720a0c486c7bf3b89faee8e708f7f2e17ebcd0247efa97eac04ae9b
7
- data.tar.gz: aba290a33903378a1459c78d5e729bf1c6710c6bc3bf9748ff17939e2308821ed37ffaf0cfff4ad76819bcb3818414739c2bc68e995990e4c0dc06cd379da3de
6
+ metadata.gz: 739665cda25b1a572ce0d788e2402cd2ef2bf3ce577a883c13403716006cf93517294f80de81a43a78b6101b28174b2514521f5c56593671b751671def7de2ea
7
+ data.tar.gz: 50128c72758a26f8b860862985cb2466556aa4b13d39be71b0e2184001cf2b6a08efd134f020988b7cc6504039238a3a3f1730013e20f0bd8b5db5f33375be01
@@ -2,5 +2,34 @@
2
2
  # require 'queenshop' # for production
3
3
  require_relative '../lib/queenshop.rb' # for testing
4
4
 
5
- scraper = QueenShopScraper::Filter.new
6
- puts scraper.scrape(ARGV)
5
+ @scraper = QueenShop::Scraper.new
6
+
7
+ # command type keyword lprice hprice page_limit
8
+ def parse_args argv
9
+ input_length = argv.length
10
+ abort 'invalid usage' unless input_length <= 5
11
+
12
+ if input_length == 0 # scrape main category
13
+ @scraper.scrape('latest')
14
+ elsif input_length == 1 # scrape main category
15
+ @scraper.scrape(argv[0])
16
+ elsif input_length == 2
17
+ t = argv[1].to_i
18
+ if t != 0
19
+ options = { page_limit: argv[1] }
20
+ else
21
+ options = { keyword: argv[1] }
22
+ end
23
+ @scraper.scrape(argv[0], options)
24
+ elsif input_length == 3
25
+ options = { keyword: argv[1], page_limit: argv[2] }
26
+ @scraper.scrape(argv[0], options)
27
+ elsif input_length == 5
28
+ options = { keyword: argv[2], page_limit: argv[5],
29
+ price_boundary: [argv[3], argv[4]]
30
+ }
31
+ @scraper.scrape_filter(argv[0], options)
32
+ end
33
+ end
34
+
35
+ puts parse_args ARGV
@@ -1,3 +1,2 @@
1
1
  #!/usr/bin/env ruby
2
- require_relative 'queenshop/config'
3
2
  require_relative 'queenshop/scraper'
@@ -2,82 +2,171 @@
2
2
  require 'oga'
3
3
  require 'iconv'
4
4
  require 'open-uri'
5
- require_relative './config'
6
5
 
7
6
  # scrape data
8
- module QueenShopScraper
9
- # filter class basically uses xpath selectors to get attribs
10
- class Filter
11
- attr_reader :result
12
- attr_writer :item_selector
13
- attr_writer :title_selector
14
- attr_writer :price_selector
15
- attr_writer :site_url
7
+ module QueenShop
8
+ # Scraper class uses xpath selectors to get attributes
9
+ class Scraper
10
+ BASE_URL = 'https://queenshop.com.tw'
11
+ BASE_SCRAPE_URL = "#{BASE_URL}/m/PDList2.asp?"
12
+
13
+ LATEST_URI = "#{BASE_SCRAPE_URL}item1=new"
14
+ DISCOUNT_URI = "#{BASE_SCRAPE_URL}item1=dis"
15
+ POPULAR_URI = "#{BASE_SCRAPE_URL}item1=pre"
16
+ TOPS_URI = "#{BASE_SCRAPE_URL}brand=01&item1=00&item2=6"
17
+ PANTS_URI = "#{BASE_SCRAPE_URL}brand=01&item1=01&item2=3"
18
+ ACCESSORIES_URI = "#{BASE_SCRAPE_URL}brand=01&item1=02&item2=2"
19
+
20
+ # xml selectors that will be used to scrape data
21
+ ITEM_SELECTOR = "//div[@class='pditem']/div[@class='pdicon']"
22
+ TITLE_SELECTOR = "div[@class='pdicon_name']/a"
23
+ IMAGE_SELECTOR = "div[@class='pdicon_img']/a/img/@src"
24
+ PRICE_SELECTOR = "div[@class='pdicon_price']/div[@style='font-weight:bold;']"
25
+ LINK_SELECTOR = "div[@class='pdicon_name']/a/@href"
26
+ PAGES_SELECTOR = "div[@class='divPageClone']/a/@href"
27
+
28
+ def latest(page, options = {})
29
+ uri = uri_with_options(build_uri(LATEST_URI, options), page)
30
+ process_request(uri, options)
31
+ end
32
+
33
+ def popular(page, options = {})
34
+ uri = uri_with_options(build_uri(LATEST_URI, options), page)
35
+ process_request(uri, options)
36
+ end
37
+
38
+ def tops(page, options = {})
39
+ uri = uri_with_options(build_uri(LATEST_URI, options), page)
40
+ process_request(uri, options)
41
+ end
42
+
43
+ def pants(page, options = {})
44
+ uri = uri_with_options(build_uri(LATEST_URI, options), page)
45
+ process_request(uri, options)
46
+ end
47
+
48
+ def accessories(page, options = {})
49
+ uri = uri_with_options(build_uri(LATEST_URI, options), page)
50
+ process_request(uri, options)
51
+ end
52
+
53
+ def search(page, options = {})
54
+ uri = uri_with_options(build_uri(BASE_SCRAPE_URL, options), page)
55
+ process_request(uri, options)
56
+ end
57
+
58
+ def scrape(type, options = {})
59
+ records = []
60
+ valid_args = [:tops, :popular, :pants, :pants,
61
+ :accessories, :latest, :search]
62
+ abort 'invalid parameter - scrape type' unless valid_args.include?(type.to_sym)
63
+ scrape_what(type, options)
64
+ end
16
65
 
17
66
  private
18
67
 
19
- def get_xmldata(url)
20
- raw_html = open(url)
21
- ic = Iconv.new('UTF-8','big5')
22
- data = ic.iconv(raw_html.read)
23
- Oga.parse_html(data)
24
- rescue StandardError
25
- 'error'
26
- end
27
-
28
- def fetch_result(uri = '')
29
- url = @site_url + uri
30
- # try to open the url
31
- document = get_xmldata(url)
32
- # hard return on an error
33
- return [] unless document != 'error'
34
-
35
- items = document.xpath(@item_selector)
36
- # loop through the items and get the title and price
37
- items.map do |item|
38
- title = item.xpath(@title_selector).text()
39
- price = item.xpath(@price_selector).text
40
- strip_filter(title, price) if title.downcase.include? @item_filter
68
+ def process_request(uri, options)
69
+ body = open_uri(uri)
70
+ data = extract_data(body)
71
+ filter(data, options)
72
+ end
73
+
74
+ # filter by price if the options are not empty
75
+ def filter(data, options)
76
+ results = data
77
+ unless options.empty?
78
+ results = match_price(results, options[:price_boundary]) if options[:price_boundary]
41
79
  end
42
- @result
80
+ results
43
81
  end
44
82
 
45
- def strip_filter (title, price)
46
- price = price.gsub!(/NT. /, '')
47
- if !@price_filter.empty?
48
- if eval("#{price} #{@price_filter}")
49
- @result << { title: "#{title}", price: "#{price}" }
50
- end
51
- else
52
- @result << { title: "#{title}", price: "#{price}" } unless title.empty?
83
+ # keep only the items whose price lies within the given boundary
84
+ def match_price(data, boundary)
85
+ lower_bound = boundary.first || 0
86
+ upper_bound = boundary.last || Float::INFINITY
87
+
88
+ data.select { |item| lower_bound <= item[:price] && item[:price] <= upper_bound }
89
+ end
90
+
91
+ def build_uri(uri, options = {})
92
+ opts = { uri: uri }
93
+ unless options.empty?
94
+ opts[:keyword] = options[:keyword] if options[:keyword]
53
95
  end
96
+ opts
97
+ end
54
98
 
99
+ def uri_with_options(options = {}, page)
100
+ uri = ''
101
+ unless options.empty?
102
+ kw = options[:keyword] || nil
103
+ ic = Iconv.new('big5','UTF-8')
104
+ keyword = ic.iconv(kw)
105
+ uri << "#{options[:uri]}&pageno=#{page}" if options[:uri]
106
+ uri << "br=X&keyword=#{URI.escape(keyword)}" if options[:keyword]
107
+ end
108
+ uri
55
109
  end
56
110
 
57
- public
111
+ # try to open the URL; return an error message string on failure
112
+ def open_uri(uri)
113
+ open(uri) {|file| file.read}
114
+ rescue StandardError
115
+ 'error opening site url'
116
+ end
58
117
 
59
- def initialize
60
- @result = []
61
- # xml selectors that will be used to scrape data
62
- @item_selector = "//div[@class=\'pditem\']/div[@class=\'pdicon\']"
63
- @title_selector = "div[@class=\'pdicon_name\']/a"
64
- @price_selector = "div[@class=\'pdicon_price\']/div[@style=\'font-weight:bold;\']"
65
- @site_url = 'https://www.queenshop.com.tw/m/PDList2.asp?'
66
- @price_filter = nil
118
+ # iterate over every element of item using xpath
119
+ def extract_data(raw)
120
+ Oga.parse_html(raw)
121
+ .xpath(ITEM_SELECTOR)
122
+ .map { |item| parse(item) }
67
123
  end
68
124
 
69
- def scrape (params=[])
70
- params = ARGV.empty? ? params : ARGV
71
- conf = QConfig.new(params)
72
- @price_filter = conf.parameters[:price]
73
- @item_filter = conf.parameters[:item].downcase
125
+ # call methods to extract the data using xpath
126
+ def parse(item)
127
+ {
128
+ title: extract_title(item),
129
+ price: extract_price(item),
130
+ images: extract_images(item),
131
+ link: extract_link(item)
132
+ }
133
+ end
74
134
 
75
- conf.pages.map do |page|
76
- paginated_uri = "&page=#{page}"
77
- fetch_result(paginated_uri)
78
- end
79
- @result
135
+ # Iconv is necessary here, otherwise the text is unreadable
136
+ def extract_title(item)
137
+ ic = Iconv.new('UTF-8','big5')
138
+ raw_title = item.xpath(TITLE_SELECTOR).text
139
+ ic.iconv(raw_title)
80
140
  end
81
141
 
142
+ # get rid of the NT and convert to integer
143
+ def extract_price(item)
144
+ item.xpath(PRICE_SELECTOR).text.sub(/NT. /, '').to_i
145
+ end
146
+
147
+ # extract two images and return an array of urls
148
+ def extract_images(item)
149
+ image = item.xpath(IMAGE_SELECTOR).text
150
+ image_hover = image.sub(/\.jpg/, '-h.jpg')
151
+ image_hover = image.sub(/\.png/, '-h.png') unless image_hover != image
152
+ ["#{BASE_URL}#{image}", "#{BASE_URL}#{image_hover}"]
153
+ end
154
+
155
+ # get the link to the item
156
+ def extract_link(item)
157
+ "#{BASE_URL}/#{item.xpath(LINK_SELECTOR).text}"
158
+ end
159
+
160
+ def scrape_what(type, options)
161
+ records = []
162
+ pl = options[:page_limit].to_i
163
+ page_limit = pl != 0 ? pl : 5
164
+
165
+ 1.upto(page_limit) do |page|
166
+ method = self.method(type)
167
+ records.push(method.call(page, options))
168
+ end
169
+ records
170
+ end
82
171
  end
83
172
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: queenshop
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Even Chang
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2015-11-22 00:00:00.000000000 Z
14
+ date: 2016-01-02 00:00:00.000000000 Z
15
15
  dependencies: []
16
16
  description: This is a gem scraping queenshop's website and returns the items with
17
17
  corresponding prices
@@ -27,7 +27,6 @@ extra_rdoc_files: []
27
27
  files:
28
28
  - bin/queenshop
29
29
  - lib/queenshop.rb
30
- - lib/queenshop/config.rb
31
30
  - lib/queenshop/scraper.rb
32
31
  homepage: http://rubygems.org/gems/queenshop
33
32
  licenses:
@@ -1,58 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- # this class takes care of
4
- # parsing the parameters
5
- module Validate
6
- attr_reader :parameters
7
- attr_reader :pages
8
-
9
- VALID_ARGS = [:item, :price, :pages]
10
-
11
- def validate_args(args)
12
- @parameters = {item: '', price: '', pages: '1..7'}
13
- args.each do |arg|
14
- begin
15
- match = /(?<key>.*?)=(?<value>.*)/.match(arg)
16
- fail unless VALID_ARGS.include?(match[:key].to_sym)
17
- value = check(match)
18
- @parameters[match[:key].to_sym] = value
19
- rescue StandardError
20
- abort "invalid usage...\n" << usage << "\n\n"
21
- end
22
- end
23
- end # end validate_args
24
-
25
- def check(match)
26
- value = match[:value]
27
- fail unless value =~ /^(>|<|>=|<=|==)\d*.\d*?$/ if match[:key].to_sym.eql?(:price)
28
- # Float(value) if match[:key].to_sym.eql?(:price)
29
- fail unless value =~ /^\d*([.]{2}\d*)?$/ if match[:key].to_sym.eql?(:pages)
30
- value
31
- rescue StandardError
32
- abort "invalid parameters"
33
- end
34
-
35
- def pages
36
- first_page = @parameters[:pages].scan(/\d+/).first.to_i
37
- last_page = @parameters[:pages].scan(/\d+/).last.to_i
38
- @pages = *(first_page..last_page)
39
- end
40
-
41
- def usage
42
- 'Usage: queenshop [options]
43
- item=(string)
44
- price=(float[,float])
45
- examples:
46
- queenshop item="blouse" price=300
47
- queenshop price=0,100
48
- queenshop item="skirt"'
49
- end
50
- end
51
-
52
- class QConfig
53
- include Validate
54
- def initialize (args)
55
- validate_args (args)
56
- pages
57
- end
58
- end