queenshop 0.0.8 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d9d7d060414ffe7379cdc099d6a036208b41ce11
4
- data.tar.gz: 51d806387e05f7247e5b364ec824a85daee6e60d
3
+ metadata.gz: 92d0b55a68d535fb78b45df74784fa6a0dc4ea76
4
+ data.tar.gz: a6fdda4008a9620f139f48e33739d0240a0d72c8
5
5
  SHA512:
6
- metadata.gz: e466143cb2b9499c8a9655c19519294754be940c149e702218a1cd7960ae26164cf69eff0720a0c486c7bf3b89faee8e708f7f2e17ebcd0247efa97eac04ae9b
7
- data.tar.gz: aba290a33903378a1459c78d5e729bf1c6710c6bc3bf9748ff17939e2308821ed37ffaf0cfff4ad76819bcb3818414739c2bc68e995990e4c0dc06cd379da3de
6
+ metadata.gz: 739665cda25b1a572ce0d788e2402cd2ef2bf3ce577a883c13403716006cf93517294f80de81a43a78b6101b28174b2514521f5c56593671b751671def7de2ea
7
+ data.tar.gz: 50128c72758a26f8b860862985cb2466556aa4b13d39be71b0e2184001cf2b6a08efd134f020988b7cc6504039238a3a3f1730013e20f0bd8b5db5f33375be01
@@ -2,5 +2,34 @@
2
2
  # require 'queenshop' # for production
3
3
  require_relative '../lib/queenshop.rb' # for testing
4
4
 
5
- scraper = QueenShopScraper::Filter.new
6
- puts scraper.scrape(ARGV)
5
+ @scraper = QueenShop::Scraper.new
6
+
7
+ # command type keyword lprice hprice page_limit
8
# Translates command-line arguments into a single @scraper.scrape call and
# returns whatever that call returns.
#
# Accepted forms (the first argument is always the scrape type):
#   (no arguments)                           scrape 'latest'
#   type
#   type page_limit                          second argument is numeric
#   type keyword                             second argument is not numeric
#   type keyword page_limit
#   type keyword lprice hprice [page_limit]
#
# Aborts with 'invalid usage' when more than five arguments are given.
def parse_args(argv)
  abort 'invalid usage' if argv.length > 5

  type = argv.first || 'latest'
  @scraper.scrape(type, scrape_options(argv))
end

# Builds the options hash for the scraper from the arguments after the type.
def scrape_options(argv)
  case argv.length
  when 0, 1
    {}
  when 2
    # A non-zero integer is read as a page limit, anything else as a keyword.
    argv[1].to_i.zero? ? { keyword: argv[1] } : { page_limit: argv[1] }
  when 3
    { keyword: argv[1], page_limit: argv[2] }
  else
    # Positions follow the documented order: keyword lprice hprice page_limit.
    options = { keyword: argv[1], price_boundary: price_bounds(argv[2], argv[3]) }
    options[:page_limit] = argv[4] if argv[4]
    options
  end
end

# Converts the two price arguments to integers so they can be compared with
# the integer prices produced by the scraper; aborts on non-numeric input.
def price_bounds(lower, upper)
  [lower, upper].map { |raw| Integer(raw, 10) }
rescue ArgumentError
  abort 'invalid usage - prices must be whole numbers'
end
34
+
35
+ puts parse_args ARGV
@@ -1,3 +1,2 @@
1
1
  #!/usr/bin/env ruby
2
- require_relative 'queenshop/config'
3
2
  require_relative 'queenshop/scraper'
@@ -2,82 +2,171 @@
2
2
  require 'oga'
3
3
  require 'iconv'
4
4
  require 'open-uri'
5
- require_relative './config'
6
5
 
7
6
  # scrape data
8
- module QueenShopScraper
9
- # filter class basically uses xpath selectors to get attribs
10
- class Filter
11
- attr_reader :result
12
- attr_writer :item_selector
13
- attr_writer :title_selector
14
- attr_writer :price_selector
15
- attr_writer :site_url
7
module QueenShop
  # Scrapes product listings from the QueenShop mobile site using XPath
  # selectors. Every item is returned as a hash with :title, :price (Integer),
  # :images (two URLs) and :link.
  class Scraper
    BASE_URL = 'https://queenshop.com.tw'
    BASE_SCRAPE_URL = "#{BASE_URL}/m/PDList2.asp?"

    LATEST_URI = "#{BASE_SCRAPE_URL}item1=new"
    DISCOUNT_URI = "#{BASE_SCRAPE_URL}item1=dis"
    POPULAR_URI = "#{BASE_SCRAPE_URL}item1=pre"
    TOPS_URI = "#{BASE_SCRAPE_URL}brand=01&item1=00&item2=6"
    PANTS_URI = "#{BASE_SCRAPE_URL}brand=01&item1=01&item2=3"
    ACCESSORIES_URI = "#{BASE_SCRAPE_URL}brand=01&item1=02&item2=2"

    # xml selectors that will be used to scrape data
    ITEM_SELECTOR = "//div[@class='pditem']/div[@class='pdicon']"
    TITLE_SELECTOR = "div[@class='pdicon_name']/a"
    IMAGE_SELECTOR = "div[@class='pdicon_img']/a/img/@src"
    PRICE_SELECTOR = "div[@class='pdicon_price']/div[@style='font-weight:bold;']"
    LINK_SELECTOR = "div[@class='pdicon_name']/a/@href"
    PAGES_SELECTOR = "div[@class='divPageClone']/a/@href"

    # Scrape types accepted by #scrape; each one names a public method below.
    VALID_TYPES = [:latest, :discount, :popular, :tops,
                   :pants, :accessories, :search].freeze

    # Number of pages scraped when no usable :page_limit option is given.
    DEFAULT_PAGE_LIMIT = 5

    # Each category method scrapes one page of its listing.
    # page    - page number, sent as the pageno query parameter
    # options - optional hash; :keyword narrows the listing and
    #           :price_boundary ([lower, upper]) filters the parsed items
    def latest(page, options = {})
      scrape_category(LATEST_URI, page, options)
    end

    def discount(page, options = {})
      scrape_category(DISCOUNT_URI, page, options)
    end

    def popular(page, options = {})
      scrape_category(POPULAR_URI, page, options)
    end

    def tops(page, options = {})
      scrape_category(TOPS_URI, page, options)
    end

    def pants(page, options = {})
      scrape_category(PANTS_URI, page, options)
    end

    def accessories(page, options = {})
      scrape_category(ACCESSORIES_URI, page, options)
    end

    # Keyword search across the whole shop (no category in the query).
    def search(page, options = {})
      scrape_category(BASE_SCRAPE_URL, page, options)
    end

    # Scrapes several pages of the given type.
    # type    - one of VALID_TYPES, as a String or Symbol
    # options - as for the category methods, plus :page_limit
    # Returns an array with one entry (an array of item hashes) per page.
    # Aborts the process when type is not a known scrape type.
    def scrape(type, options = {})
      abort 'invalid parameter - scrape type' unless VALID_TYPES.include?(type.to_sym)
      scrape_what(type, options)
    end

    private

    # Shared body of the category methods: build the page URI, fetch, filter.
    def scrape_category(base_uri, page, options)
      uri = uri_with_options(build_uri(base_uri, options), page)
      process_request(uri, options)
    end

    def process_request(uri, options)
      body = open_uri(uri)
      data = extract_data(body)
      filter(data, options)
    end

    # filter by price when a :price_boundary option is present
    def filter(data, options)
      boundary = options[:price_boundary]
      boundary ? match_price(data, boundary) : data
    end

    # Keeps the items whose price lies within boundary (inclusive).
    # boundary - [lower, upper]; a nil lower bound means 0 and a nil upper
    #            bound means no limit. Bounds may be numbers or numeric
    #            strings (command-line input); other values raise ArgumentError.
    def match_price(data, boundary)
      lower_bound = Float(boundary.first || 0)
      upper_bound = Float(boundary.last || Float::INFINITY)

      data.select { |item| item[:price].between?(lower_bound, upper_bound) }
    end

    # Bundles the base URI with the keyword (when given) for uri_with_options.
    def build_uri(uri, options = {})
      opts = { uri: uri }
      opts[:keyword] = options[:keyword] if options[:keyword]
      opts
    end

    # Renders the request URI for one page. The keyword is transcoded to Big5
    # (the encoding the original Iconv call targeted) and percent-escaped, and
    # is joined with '&' so it does not run into the pageno value.
    def uri_with_options(options = {}, page)
      return '' if options.empty?

      uri = String.new
      uri << "#{options[:uri]}&pageno=#{page}" if options[:uri]
      if options[:keyword]
        keyword = URI.encode_www_form_component(options[:keyword], Encoding::Big5)
        uri << "&br=X&keyword=#{keyword}"
      end
      uri
    end

    # Fetches the page body. On any failure a warning is printed and a
    # placeholder string is returned in place of the page body.
    # URI#open is used instead of Kernel#open, which would also accept
    # "|command" strings and no longer opens URLs on Ruby 3.
    def open_uri(uri)
      URI.parse(uri).open(&:read)
    rescue StandardError => e
      warn "queenshop: could not open #{uri} (#{e.class}: #{e.message})"
      'error opening site url'
    end

    # iterate over every element of item using xpath
    def extract_data(raw)
      Oga.parse_html(raw)
         .xpath(ITEM_SELECTOR)
         .map { |item| parse(item) }
    end

    # call methods to extract the data using xpath
    def parse(item)
      {
        title: extract_title(item),
        price: extract_price(item),
        images: extract_images(item),
        link: extract_link(item)
      }
    end

    # The title bytes are treated as Big5 and converted to UTF-8, otherwise
    # the text is unreadable. Bytes that cannot be converted are replaced
    # rather than aborting the whole scrape.
    def extract_title(item)
      raw_title = item.xpath(TITLE_SELECTOR).text
      raw_title.dup
               .force_encoding(Encoding::Big5)
               .encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
    end

    # get rid of the NT prefix and convert to integer
    def extract_price(item)
      item.xpath(PRICE_SELECTOR).text.sub(/NT. /, '').to_i
    end

    # Returns [image_url, hover_image_url]; the hover variant inserts "-h"
    # before the .jpg extension, or before .png when there is no .jpg.
    def extract_images(item)
      image = item.xpath(IMAGE_SELECTOR).text
      image_hover = image.sub(/\.jpg/, '-h.jpg')
      image_hover = image.sub(/\.png/, '-h.png') if image_hover == image
      ["#{BASE_URL}#{image}", "#{BASE_URL}#{image_hover}"]
    end

    # get the link to the item
    def extract_link(item)
      "#{BASE_URL}/#{item.xpath(LINK_SELECTOR).text}"
    end

    # Calls the method named by type once per page, from page 1 up to
    # options[:page_limit] (DEFAULT_PAGE_LIMIT when missing, zero or negative).
    def scrape_what(type, options)
      page_limit = options[:page_limit].to_i
      page_limit = DEFAULT_PAGE_LIMIT unless page_limit > 0

      (1..page_limit).map { |page| public_send(type, page, options) }
    end
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: queenshop
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Even Chang
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2015-11-22 00:00:00.000000000 Z
14
+ date: 2016-01-02 00:00:00.000000000 Z
15
15
  dependencies: []
16
16
  description: This is a gem scraping queenshop's website and returns the items with
17
17
  corresponding prices
@@ -27,7 +27,6 @@ extra_rdoc_files: []
27
27
  files:
28
28
  - bin/queenshop
29
29
  - lib/queenshop.rb
30
- - lib/queenshop/config.rb
31
30
  - lib/queenshop/scraper.rb
32
31
  homepage: http://rubygems.org/gems/queenshop
33
32
  licenses:
@@ -1,58 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- # this class takes care of
4
- # parsing the parameters
5
- module Validate
6
- attr_reader :parameters
7
- attr_reader :pages
8
-
9
- VALID_ARGS = [:item, :price, :pages]
10
-
11
- def validate_args(args)
12
- @parameters = {item: '', price: '', pages: '1..7'}
13
- args.each do |arg|
14
- begin
15
- match = /(?<key>.*?)=(?<value>.*)/.match(arg)
16
- fail unless VALID_ARGS.include?(match[:key].to_sym)
17
- value = check(match)
18
- @parameters[match[:key].to_sym] = value
19
- rescue StandardError
20
- abort "invalid usage...\n" << usage << "\n\n"
21
- end
22
- end
23
- end # end validate_args
24
-
25
- def check(match)
26
- value = match[:value]
27
- fail unless value =~ /^(>|<|>=|<=|==)\d*.\d*?$/ if match[:key].to_sym.eql?(:price)
28
- # Float(value) if match[:key].to_sym.eql?(:price)
29
- fail unless value =~ /^\d*([.]{2}\d*)?$/ if match[:key].to_sym.eql?(:pages)
30
- value
31
- rescue StandardError
32
- abort "invalid parameters"
33
- end
34
-
35
- def pages
36
- first_page = @parameters[:pages].scan(/\d+/).first.to_i
37
- last_page = @parameters[:pages].scan(/\d+/).last.to_i
38
- @pages = *(first_page..last_page)
39
- end
40
-
41
- def usage
42
- 'Usage: queenshop [options]
43
- item=(string)
44
- price=(float[,float])
45
- examples:
46
- queenshop item="blouse" price=300
47
- queenshop price=0,100
48
- queenshop item="skirt"'
49
- end
50
- end
51
-
52
- class QConfig
53
- include Validate
54
- def initialize (args)
55
- validate_args (args)
56
- pages
57
- end
58
- end