queenshop 0.0.8 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/queenshop +31 -2
- data/lib/queenshop.rb +0 -1
- data/lib/queenshop/scraper.rb +148 -59
- metadata +2 -3
- data/lib/queenshop/config.rb +0 -58
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 92d0b55a68d535fb78b45df74784fa6a0dc4ea76
|
4
|
+
data.tar.gz: a6fdda4008a9620f139f48e33739d0240a0d72c8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 739665cda25b1a572ce0d788e2402cd2ef2bf3ce577a883c13403716006cf93517294f80de81a43a78b6101b28174b2514521f5c56593671b751671def7de2ea
|
7
|
+
data.tar.gz: 50128c72758a26f8b860862985cb2466556aa4b13d39be71b0e2184001cf2b6a08efd134f020988b7cc6504039238a3a3f1730013e20f0bd8b5db5f33375be01
|
data/bin/queenshop
CHANGED
@@ -2,5 +2,34 @@
|
|
2
2
|
# require 'queenshop' # for production
|
3
3
|
require_relative '../lib/queenshop.rb' # for testing
|
4
4
|
|
5
|
-
scraper =
|
6
|
-
|
5
|
+
@scraper = QueenShop::Scraper.new
|
6
|
+
|
7
|
+
# command type keyword lprice hprice page_limit
|
8
|
+
def parse_args argv
|
9
|
+
input_length = argv.length
|
10
|
+
abort 'invalid usage' unless input_length <= 5
|
11
|
+
|
12
|
+
if input_length == 0 # scrape main category
|
13
|
+
@scraper.scrape('latest')
|
14
|
+
elsif input_length == 1 # scrape main category
|
15
|
+
@scraper.scrape(argv[0])
|
16
|
+
elsif input_length == 2
|
17
|
+
t = argv[1].to_i
|
18
|
+
if t != 0
|
19
|
+
options = { page_limit: argv[1] }
|
20
|
+
else
|
21
|
+
options = { keyword: argv[1] }
|
22
|
+
end
|
23
|
+
@scraper.scrape(argv[0], options)
|
24
|
+
elsif input_length == 3
|
25
|
+
options = { keyword: argv[1], page_limit: argv[2] }
|
26
|
+
@scraper.scrape(argv[0], options)
|
27
|
+
elsif input_length == 5
|
28
|
+
options = { keyword: argv[2], page_limit: argv[5],
|
29
|
+
price_boundary: [argv[3], argv[4]]
|
30
|
+
}
|
31
|
+
@scraper.scrape_filter(argv[0], options)
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
puts parse_args ARGV
|
data/lib/queenshop.rb
CHANGED
data/lib/queenshop/scraper.rb
CHANGED
@@ -2,82 +2,171 @@
|
|
2
2
|
require 'oga'
|
3
3
|
require 'iconv'
|
4
4
|
require 'open-uri'
|
5
|
-
require_relative './config'
|
6
5
|
|
7
6
|
# scrape data
|
8
|
-
module
|
9
|
-
#
|
10
|
-
class
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
7
|
+
module QueenShop
|
8
|
+
# extract_data class uses xpath selectors to get attribs
|
9
|
+
class Scraper
|
10
|
+
BASE_URL = 'https://queenshop.com.tw'
|
11
|
+
BASE_SCRAPE_URL = "#{BASE_URL}/m/PDList2.asp?"
|
12
|
+
|
13
|
+
LATEST_URI = "#{BASE_SCRAPE_URL}item1=new"
|
14
|
+
DISCOUNT_URI = "#{BASE_SCRAPE_URL}item1=dis"
|
15
|
+
POPULAR_URI = "#{BASE_SCRAPE_URL}item1=pre"
|
16
|
+
TOPS_URI = "#{BASE_SCRAPE_URL}brand=01&item1=00&item2=6"
|
17
|
+
PANTS_URI = "#{BASE_SCRAPE_URL}brand=01&item1=01&item2=3"
|
18
|
+
ACCESSORIES_URI = "#{BASE_SCRAPE_URL}brand=01&item1=02&item2=2"
|
19
|
+
|
20
|
+
# xml selectors that will be used to scrape data
|
21
|
+
ITEM_SELECTOR = "//div[@class='pditem']/div[@class='pdicon']"
|
22
|
+
TITLE_SELECTOR = "div[@class='pdicon_name']/a"
|
23
|
+
IMAGE_SELECTOR = "div[@class='pdicon_img']/a/img/@src"
|
24
|
+
PRICE_SELECTOR = "div[@class='pdicon_price']/div[@style='font-weight:bold;']"
|
25
|
+
LINK_SELECTOR = "div[@class='pdicon_name']/a/@href"
|
26
|
+
PAGES_SELECTOR = "div[@class='divPageClone']/a/@href"
|
27
|
+
|
28
|
+
def latest(page, options = {})
|
29
|
+
uri = uri_with_options(build_uri(LATEST_URI, options), page)
|
30
|
+
process_request(uri, options)
|
31
|
+
end
|
32
|
+
|
33
|
+
def popular(page, options = {})
|
34
|
+
uri = uri_with_options(build_uri(LATEST_URI, options), page)
|
35
|
+
process_request(uri, options)
|
36
|
+
end
|
37
|
+
|
38
|
+
def tops(page, options = {})
|
39
|
+
uri = uri_with_options(build_uri(LATEST_URI, options), page)
|
40
|
+
process_request(uri, options)
|
41
|
+
end
|
42
|
+
|
43
|
+
def pants(page, options = {})
|
44
|
+
uri = uri_with_options(build_uri(LATEST_URI, options), page)
|
45
|
+
process_request(uri, options)
|
46
|
+
end
|
47
|
+
|
48
|
+
def accessories(page, options = {})
|
49
|
+
uri = uri_with_options(build_uri(LATEST_URI, options), page)
|
50
|
+
process_request(uri, options)
|
51
|
+
end
|
52
|
+
|
53
|
+
def search(page, options = {})
|
54
|
+
uri = uri_with_options(build_uri(BASE_SCRAPE_URL, options), page)
|
55
|
+
process_request(uri, options)
|
56
|
+
end
|
57
|
+
|
58
|
+
def scrape(type, options = {})
|
59
|
+
records = []
|
60
|
+
valid_args = [:tops, :popular, :pants, :pants,
|
61
|
+
:accessories, :latest, :search]
|
62
|
+
abort 'invalid parameter - scrape type' unless valid_args.include?(type.to_sym)
|
63
|
+
scrape_what(type, options)
|
64
|
+
end
|
16
65
|
|
17
66
|
private
|
18
67
|
|
19
|
-
def
|
20
|
-
|
21
|
-
|
22
|
-
data
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
# try to open the url
|
31
|
-
document = get_xmldata(url)
|
32
|
-
# hard return on an error
|
33
|
-
return [] unless document != 'error'
|
34
|
-
|
35
|
-
items = document.xpath(@item_selector)
|
36
|
-
# loop through the items and get the title and price
|
37
|
-
items.map do |item|
|
38
|
-
title = item.xpath(@title_selector).text()
|
39
|
-
price = item.xpath(@price_selector).text
|
40
|
-
strip_filter(title, price) if title.downcase.include? @item_filter
|
68
|
+
def process_request(uri, options)
|
69
|
+
body = open_uri(uri)
|
70
|
+
data = extract_data(body)
|
71
|
+
filter(data, options)
|
72
|
+
end
|
73
|
+
|
74
|
+
# filter by price if the options are not empty
|
75
|
+
def filter(data, options)
|
76
|
+
results = data
|
77
|
+
unless options.empty?
|
78
|
+
results = match_price(results, options[:price_boundary]) if options[:price_boundary]
|
41
79
|
end
|
42
|
-
|
80
|
+
results
|
43
81
|
end
|
44
82
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
83
|
+
# do the actual extraction of prices from the result set
|
84
|
+
def match_price(data, boundary)
|
85
|
+
lower_bound = boundary.first || 0
|
86
|
+
upper_bound = boundary.last || Float::INFINITY
|
87
|
+
|
88
|
+
data.select { |item| lower_bound <= item[:price] && item[:price] <= upper_bound }
|
89
|
+
end
|
90
|
+
|
91
|
+
def build_uri(uri, options = {})
|
92
|
+
opts = { uri: uri }
|
93
|
+
unless options.empty?
|
94
|
+
opts[:keyword] = options[:keyword] if options[:keyword]
|
53
95
|
end
|
96
|
+
opts
|
97
|
+
end
|
54
98
|
|
99
|
+
def uri_with_options(options = {}, page)
|
100
|
+
uri = ''
|
101
|
+
unless options.empty?
|
102
|
+
kw = options[:keyword] || nil
|
103
|
+
ic = Iconv.new('big5','UTF-8')
|
104
|
+
keyword = ic.iconv(kw)
|
105
|
+
uri << "#{options[:uri]}&pageno=#{page}" if options[:uri]
|
106
|
+
uri << "br=X&keyword=#{URI.escape(keyword)}" if options[:keyword]
|
107
|
+
end
|
108
|
+
uri
|
55
109
|
end
|
56
110
|
|
57
|
-
|
111
|
+
# try open the URL, fail on error
|
112
|
+
def open_uri(uri)
|
113
|
+
open(uri) {|file| file.read}
|
114
|
+
rescue StandardError
|
115
|
+
'error opening site url'
|
116
|
+
end
|
58
117
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
@price_selector = "div[@class=\'pdicon_price\']/div[@style=\'font-weight:bold;\']"
|
65
|
-
@site_url = 'https://www.queenshop.com.tw/m/PDList2.asp?'
|
66
|
-
@price_filter = nil
|
118
|
+
# iterate over every element of item using xpath
|
119
|
+
def extract_data(raw)
|
120
|
+
Oga.parse_html(raw)
|
121
|
+
.xpath(ITEM_SELECTOR)
|
122
|
+
.map { |item| parse(item) }
|
67
123
|
end
|
68
124
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
125
|
+
# call methods to extract the data using xpath
|
126
|
+
def parse(item)
|
127
|
+
{
|
128
|
+
title: extract_title(item),
|
129
|
+
price: extract_price(item),
|
130
|
+
images: extract_images(item),
|
131
|
+
link: extract_link(item)
|
132
|
+
}
|
133
|
+
end
|
74
134
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
135
|
+
# Iconv is necessary here otherwise text is unreadable
|
136
|
+
def extract_title(item)
|
137
|
+
ic = Iconv.new('UTF-8','big5')
|
138
|
+
raw_title = item.xpath(TITLE_SELECTOR).text
|
139
|
+
ic.iconv(raw_title)
|
80
140
|
end
|
81
141
|
|
142
|
+
# get rid of the NT and convert to integer
|
143
|
+
def extract_price(item)
|
144
|
+
item.xpath(PRICE_SELECTOR).text.sub(/NT. /, '').to_i
|
145
|
+
end
|
146
|
+
|
147
|
+
# extract two images and return an array of urls
|
148
|
+
def extract_images(item)
|
149
|
+
image = item.xpath(IMAGE_SELECTOR).text
|
150
|
+
image_hover = image.sub(/\.jpg/, '-h.jpg')
|
151
|
+
image_hover = image.sub(/\.png/, '-h.png') unless image_hover != image
|
152
|
+
["#{BASE_URL}#{image}", "#{BASE_URL}#{image_hover}"]
|
153
|
+
end
|
154
|
+
|
155
|
+
# get the link to the item
|
156
|
+
def extract_link(item)
|
157
|
+
"#{BASE_URL}/#{item.xpath(LINK_SELECTOR).text}"
|
158
|
+
end
|
159
|
+
|
160
|
+
def scrape_what(type, options)
|
161
|
+
records = []
|
162
|
+
pl = options[:page_limit].to_i
|
163
|
+
page_limit = pl != 0 ? pl : 5
|
164
|
+
|
165
|
+
1.upto(page_limit) do |page|
|
166
|
+
method = self.method(type)
|
167
|
+
records.push(method.call(page, options))
|
168
|
+
end
|
169
|
+
records
|
170
|
+
end
|
82
171
|
end
|
83
172
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: queenshop
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Even Chang
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date:
|
14
|
+
date: 2016-01-02 00:00:00.000000000 Z
|
15
15
|
dependencies: []
|
16
16
|
description: This is a gem scraping queenshop's website and returns the items with
|
17
17
|
corresponding prices
|
@@ -27,7 +27,6 @@ extra_rdoc_files: []
|
|
27
27
|
files:
|
28
28
|
- bin/queenshop
|
29
29
|
- lib/queenshop.rb
|
30
|
-
- lib/queenshop/config.rb
|
31
30
|
- lib/queenshop/scraper.rb
|
32
31
|
homepage: http://rubygems.org/gems/queenshop
|
33
32
|
licenses:
|
data/lib/queenshop/config.rb
DELETED
@@ -1,58 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
# this class takes care of
|
4
|
-
# parsing the parameters
|
5
|
-
module Validate
|
6
|
-
attr_reader :parameters
|
7
|
-
attr_reader :pages
|
8
|
-
|
9
|
-
VALID_ARGS = [:item, :price, :pages]
|
10
|
-
|
11
|
-
def validate_args(args)
|
12
|
-
@parameters = {item: '', price: '', pages: '1..7'}
|
13
|
-
args.each do |arg|
|
14
|
-
begin
|
15
|
-
match = /(?<key>.*?)=(?<value>.*)/.match(arg)
|
16
|
-
fail unless VALID_ARGS.include?(match[:key].to_sym)
|
17
|
-
value = check(match)
|
18
|
-
@parameters[match[:key].to_sym] = value
|
19
|
-
rescue StandardError
|
20
|
-
abort "invalid usage...\n" << usage << "\n\n"
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end # end validate_args
|
24
|
-
|
25
|
-
def check(match)
|
26
|
-
value = match[:value]
|
27
|
-
fail unless value =~ /^(>|<|>=|<=|==)\d*.\d*?$/ if match[:key].to_sym.eql?(:price)
|
28
|
-
# Float(value) if match[:key].to_sym.eql?(:price)
|
29
|
-
fail unless value =~ /^\d*([.]{2}\d*)?$/ if match[:key].to_sym.eql?(:pages)
|
30
|
-
value
|
31
|
-
rescue StandardError
|
32
|
-
abort "invalid parameters"
|
33
|
-
end
|
34
|
-
|
35
|
-
def pages
|
36
|
-
first_page = @parameters[:pages].scan(/\d+/).first.to_i
|
37
|
-
last_page = @parameters[:pages].scan(/\d+/).last.to_i
|
38
|
-
@pages = *(first_page..last_page)
|
39
|
-
end
|
40
|
-
|
41
|
-
def usage
|
42
|
-
'Usage: queenshop [options]
|
43
|
-
item=(string)
|
44
|
-
price=(float[,float])
|
45
|
-
examples:
|
46
|
-
queenshop item="blouse" price=300
|
47
|
-
queenshop price=0,100
|
48
|
-
queenshop item="skirt"'
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
class QConfig
|
53
|
-
include Validate
|
54
|
-
def initialize (args)
|
55
|
-
validate_args (args)
|
56
|
-
pages
|
57
|
-
end
|
58
|
-
end
|