queenshop 0.0.8 → 0.1.0
- checksums.yaml +4 -4
- data/bin/queenshop +31 -2
- data/lib/queenshop.rb +0 -1
- data/lib/queenshop/scraper.rb +148 -59
- metadata +2 -3
- data/lib/queenshop/config.rb +0 -58
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 92d0b55a68d535fb78b45df74784fa6a0dc4ea76
+  data.tar.gz: a6fdda4008a9620f139f48e33739d0240a0d72c8
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 739665cda25b1a572ce0d788e2402cd2ef2bf3ce577a883c13403716006cf93517294f80de81a43a78b6101b28174b2514521f5c56593671b751671def7de2ea
+  data.tar.gz: 50128c72758a26f8b860862985cb2466556aa4b13d39be71b0e2184001cf2b6a08efd134f020988b7cc6504039238a3a3f1730013e20f0bd8b5db5f33375be01
data/bin/queenshop
CHANGED
@@ -2,5 +2,34 @@
 # require 'queenshop' # for production
 require_relative '../lib/queenshop.rb' # for testing

-scraper =
-
+@scraper = QueenShop::Scraper.new
+
+# command type keyword lprice hprice page_limit
+def parse_args argv
+  input_length = argv.length
+  abort 'invalid usage' unless input_length <= 5
+
+  if input_length == 0 # scrape main category
+    @scraper.scrape('latest')
+  elsif input_length == 1 # scrape main category
+    @scraper.scrape(argv[0])
+  elsif input_length == 2
+    t = argv[1].to_i
+    if t != 0
+      options = { page_limit: argv[1] }
+    else
+      options = { keyword: argv[1] }
+    end
+    @scraper.scrape(argv[0], options)
+  elsif input_length == 3
+    options = { keyword: argv[1], page_limit: argv[2] }
+    @scraper.scrape(argv[0], options)
+  elsif input_length == 5
+    options = { keyword: argv[2], page_limit: argv[5],
+                price_boundary: [argv[3], argv[4]]
+              }
+    @scraper.scrape_filter(argv[0], options)
+  end
+end
+
+puts parse_args ARGV
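For reference, a rough sketch of how the argument handling above maps command-line input onto the scraper. The invocations and values are illustrative only (not taken from the gem's documentation) and simply follow parse_args as written:

# queenshop                  -> @scraper.scrape('latest')
# queenshop popular          -> @scraper.scrape('popular')
# queenshop search blouse    -> @scraper.scrape('search', keyword: 'blouse')
# queenshop search blouse 3  -> @scraper.scrape('search', keyword: 'blouse', page_limit: '3')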
data/lib/queenshop.rb
CHANGED
data/lib/queenshop/scraper.rb
CHANGED
@@ -2,82 +2,171 @@
 require 'oga'
 require 'iconv'
 require 'open-uri'
-require_relative './config'

 # scrape data
-module
-  #
-  class
-
-
-
-
-
+module QueenShop
+  # extract_data class uses xpath selectors to get attribs
+  class Scraper
+    BASE_URL = 'https://queenshop.com.tw'
+    BASE_SCRAPE_URL = "#{BASE_URL}/m/PDList2.asp?"
+
+    LATEST_URI = "#{BASE_SCRAPE_URL}item1=new"
+    DISCOUNT_URI = "#{BASE_SCRAPE_URL}item1=dis"
+    POPULAR_URI = "#{BASE_SCRAPE_URL}item1=pre"
+    TOPS_URI = "#{BASE_SCRAPE_URL}brand=01&item1=00&item2=6"
+    PANTS_URI = "#{BASE_SCRAPE_URL}brand=01&item1=01&item2=3"
+    ACCESSORIES_URI = "#{BASE_SCRAPE_URL}brand=01&item1=02&item2=2"
+
+    # xml selectors that will be used to scrape data
+    ITEM_SELECTOR = "//div[@class='pditem']/div[@class='pdicon']"
+    TITLE_SELECTOR = "div[@class='pdicon_name']/a"
+    IMAGE_SELECTOR = "div[@class='pdicon_img']/a/img/@src"
+    PRICE_SELECTOR = "div[@class='pdicon_price']/div[@style='font-weight:bold;']"
+    LINK_SELECTOR = "div[@class='pdicon_name']/a/@href"
+    PAGES_SELECTOR = "div[@class='divPageClone']/a/@href"
+
+    def latest(page, options = {})
+      uri = uri_with_options(build_uri(LATEST_URI, options), page)
+      process_request(uri, options)
+    end
+
+    def popular(page, options = {})
+      uri = uri_with_options(build_uri(LATEST_URI, options), page)
+      process_request(uri, options)
+    end
+
+    def tops(page, options = {})
+      uri = uri_with_options(build_uri(LATEST_URI, options), page)
+      process_request(uri, options)
+    end
+
+    def pants(page, options = {})
+      uri = uri_with_options(build_uri(LATEST_URI, options), page)
+      process_request(uri, options)
+    end
+
+    def accessories(page, options = {})
+      uri = uri_with_options(build_uri(LATEST_URI, options), page)
+      process_request(uri, options)
+    end
+
+    def search(page, options = {})
+      uri = uri_with_options(build_uri(BASE_SCRAPE_URL, options), page)
+      process_request(uri, options)
+    end
+
+    def scrape(type, options = {})
+      records = []
+      valid_args = [:tops, :popular, :pants, :pants,
+                    :accessories, :latest, :search]
+      abort 'invalid parameter - scrape type' unless valid_args.include?(type.to_sym)
+      scrape_what(type, options)
+    end

     private

-    def
-
-
-      data
-
-
-
-
-
-
-
-      # try to open the url
-      document = get_xmldata(url)
-      # hard return on an error
-      return [] unless document != 'error'
-
-      items = document.xpath(@item_selector)
-      # loop through the items and get the title and price
-      items.map do |item|
-        title = item.xpath(@title_selector).text()
-        price = item.xpath(@price_selector).text
-        strip_filter(title, price) if title.downcase.include? @item_filter
+    def process_request(uri, options)
+      body = open_uri(uri)
+      data = extract_data(body)
+      filter(data, options)
+    end
+
+    # filter by price if the options are not empty
+    def filter(data, options)
+      results = data
+      unless options.empty?
+        results = match_price(results, options[:price_boundary]) if options[:price_boundary]
       end
-
+      results
     end

-
-
-
-
-
-
-
-
+    # do the actual extraction of prices from the result set
+    def match_price(data, boundary)
+      lower_bound = boundary.first || 0
+      upper_bound = boundary.last || Float::INFINITY
+
+      data.select { |item| lower_bound <= item[:price] && item[:price] <= upper_bound }
+    end
+
+    def build_uri(uri, options = {})
+      opts = { uri: uri }
+      unless options.empty?
+        opts[:keyword] = options[:keyword] if options[:keyword]
       end
+      opts
+    end

+    def uri_with_options(options = {}, page)
+      uri = ''
+      unless options.empty?
+        kw = options[:keyword] || nil
+        ic = Iconv.new('big5','UTF-8')
+        keyword = ic.iconv(kw)
+        uri << "#{options[:uri]}&pageno=#{page}" if options[:uri]
+        uri << "br=X&keyword=#{URI.escape(keyword)}" if options[:keyword]
+      end
+      uri
     end

-
+    # try open the URL, fail on error
+    def open_uri(uri)
+      open(uri) {|file| file.read}
+    rescue StandardError
+      'error opening site url'
+    end

-
-
-
-
-
-      @price_selector = "div[@class=\'pdicon_price\']/div[@style=\'font-weight:bold;\']"
-      @site_url = 'https://www.queenshop.com.tw/m/PDList2.asp?'
-      @price_filter = nil
+    # iterate over every element of item using xpath
+    def extract_data(raw)
+      Oga.parse_html(raw)
+         .xpath(ITEM_SELECTOR)
+         .map { |item| parse(item) }
     end

-
-
-
-
-
+    # call methods to extract the data using xpath
+    def parse(item)
+      {
+        title: extract_title(item),
+        price: extract_price(item),
+        images: extract_images(item),
+        link: extract_link(item)
+      }
+    end

-
-
-
-
-
+    # Iconv is neccessary here otherwise text is unreadable
+    def extract_title(item)
+      ic = Iconv.new('UTF-8','big5')
+      raw_title = item.xpath(TITLE_SELECTOR).text
+      ic.iconv(raw_title)
     end

+    # get rid of the NT and convert to integer
+    def extract_price(item)
+      item.xpath(PRICE_SELECTOR).text.sub(/NT. /, '').to_i
+    end
+
+    # extract two images and return array or urls
+    def extract_images(item)
+      image = item.xpath(IMAGE_SELECTOR).text
+      image_hover = image.sub(/\.jpg/, '-h.jpg')
+      image_hover = image.sub(/\.png/, '-h.png') unless image_hover != image
+      ["#{BASE_URL}#{image}", "#{BASE_URL}#{image_hover}"]
+    end
+
+    # get the link to the item
+    def extract_link(item)
+      "#{BASE_URL}/#{item.xpath(LINK_SELECTOR).text}"
+    end
+
+    def scrape_what(type, options)
+      records = []
+      pl = options[:page_limit].to_i
+      page_limit = pl != 0 ? pl : 5
+
+      1.upto(page_limit) do |page|
+        method = self.method(type)
+        records.push(method.call(page, options))
+      end
+      records
+    end
   end
 end
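A minimal usage sketch of the reworked Scraper when called as a library, assuming the installed gem can be loaded with require 'queenshop' (as the commented-out require in bin/queenshop suggests). The option keys and the item hash keys (:title, :price, :link) come from the code above; the keyword and page count are illustrative:

require 'queenshop'

scraper = QueenShop::Scraper.new

# scrape_what above loops from page 1 to the page limit, so this returns one array of item hashes per page
pages = scraper.scrape('search', keyword: 'dress', page_limit: 2)

pages.flatten.each do |item|
  puts "#{item[:title]} NT$#{item[:price]} #{item[:link]}"
end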
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: queenshop
 version: !ruby/object:Gem::Version
-  version: 0.0.8
+  version: 0.1.0
 platform: ruby
 authors:
 - Even Chang
@@ -11,7 +11,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2016-01-02 00:00:00.000000000 Z
 dependencies: []
 description: This is a gem scraping queenshop's website and returns the items with
   corresponding prices
@@ -27,7 +27,6 @@ extra_rdoc_files: []
 files:
 - bin/queenshop
 - lib/queenshop.rb
-- lib/queenshop/config.rb
 - lib/queenshop/scraper.rb
 homepage: http://rubygems.org/gems/queenshop
 licenses:
data/lib/queenshop/config.rb
DELETED
@@ -1,58 +0,0 @@
-#!/usr/bin/env ruby
-
-# this class takes care of
-# parsing the parameters
-module Validate
-  attr_reader :parameters
-  attr_reader :pages
-
-  VALID_ARGS = [:item, :price, :pages]
-
-  def validate_args(args)
-    @parameters = {item: '', price: '', pages: '1..7'}
-    args.each do |arg|
-      begin
-        match = /(?<key>.*?)=(?<value>.*)/.match(arg)
-        fail unless VALID_ARGS.include?(match[:key].to_sym)
-        value = check(match)
-        @parameters[match[:key].to_sym] = value
-      rescue StandardError
-        abort "invalid usage...\n" << usage << "\n\n"
-      end
-    end
-  end # end validate_args
-
-  def check(match)
-    value = match[:value]
-    fail unless value =~ /^(>|<|>=|<=|==)\d*.\d*?$/ if match[:key].to_sym.eql?(:price)
-    # Float(value) if match[:key].to_sym.eql?(:price)
-    fail unless value =~ /^\d*([.]{2}\d*)?$/ if match[:key].to_sym.eql?(:pages)
-    value
-  rescue StandardError
-    abort "invalid parameters"
-  end
-
-  def pages
-    first_page = @parameters[:pages].scan(/\d+/).first.to_i
-    last_page = @parameters[:pages].scan(/\d+/).last.to_i
-    @pages = *(first_page..last_page)
-  end
-
-  def usage
-    'Usage: queenshop [options]
-    item=(string)
-    price=(float[,float])
-    examples:
-    queenshop item="blouse" price=300
-    queenshop price=0,100
-    queenshop item="skirt"'
-  end
-end
-
-class QConfig
-  include Validate
-  def initialize (args)
-    validate_args (args)
-    pages
-  end
-end