joyceshop 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/joyceshop +31 -3
- data/lib/joyceshop/scraper.rb +100 -78
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 05a440353df86f1a449acbe8633aefd1139a0c55
|
4
|
+
data.tar.gz: 9cedcb43aca538bb30e5e1de8bf0a17d472e795e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e2a8e4481761300cc527fc1103c377a54a1d547ee5d92a3a5fb14ebd60438e10427d8458625ec8484da37f4cb6a5253b464c124ff6a13c7859e67e8de472abfb
|
7
|
+
data.tar.gz: 986e94aa57f5ebe8c9470491079c0452c0e06aaa2886d3e767c0191ac3e5c51f07d6b19b462c48b7b20c1a8a5717a4f8e87b8a57ae0fe037a345f23bdf0d9ae2
|
data/bin/joyceshop
CHANGED
@@ -2,6 +2,34 @@
|
|
2
2
|
# require 'joyceshop' # for production
|
3
3
|
require_relative '../lib/joyceshop.rb' # for testing
|
4
4
|
|
5
|
-
scraper = JoyceShop::Scraper.new
|
6
|
-
|
7
|
-
|
5
|
+
@scraper = JoyceShop::Scraper.new
|
6
|
+
|
7
|
+
# command type keyword lprice hprice page_limit
|
8
|
+
def parse_args argv
|
9
|
+
input_length = argv.length
|
10
|
+
abort 'invalid usage' unless input_length <= 5
|
11
|
+
|
12
|
+
if input_length == 0 # scrape main category
|
13
|
+
@scraper.scrape('latest')
|
14
|
+
elsif input_length == 1 # scrape main category
|
15
|
+
@scraper.scrape(argv[0])
|
16
|
+
elsif input_length == 2
|
17
|
+
t = argv[1].to_i
|
18
|
+
if t != 0
|
19
|
+
options = { page_limit: argv[1] }
|
20
|
+
else
|
21
|
+
options = { keyword: argv[1] }
|
22
|
+
end
|
23
|
+
@scraper.scrape(argv[0], options)
|
24
|
+
elsif input_length == 3
|
25
|
+
options = { keyword: argv[1], page_limit: argv[2] }
|
26
|
+
@scraper.scrape(argv[0], options)
|
27
|
+
elsif input_length == 5
|
28
|
+
options = { keyword: argv[2], page_limit: argv[5],
|
29
|
+
price_boundary: [argv[3], argv[4]]
|
30
|
+
}
|
31
|
+
@scraper.scrape_filter(argv[0], options)
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
puts parse_args ARGV
|
data/lib/joyceshop/scraper.rb
CHANGED
@@ -5,104 +5,84 @@ require 'open-uri'
|
|
5
5
|
|
6
6
|
# scrape data
|
7
7
|
module JoyceShop
|
8
|
+
# extract_data class uses xpath selectors to get attribs
|
8
9
|
class Scraper
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
#
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
#
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
@@ITEM_INFO_SELECTOR = "div[contains(@class, 'NEW_shop_list_info')]"
|
26
|
-
@@TITLE_SELECTOR = "#{@@ITEM_INFO_SELECTOR}/div[1]"
|
27
|
-
@@PRICE_SELECTOR = "#{@@ITEM_INFO_SELECTOR}/span"
|
10
|
+
BASE_URL = 'https://www.joyce-shop.com'
|
11
|
+
BASE_SCRAPE_URL = "#{BASE_URL}/PDList.asp?"
|
12
|
+
|
13
|
+
LATEST_URI = "#{BASE_SCRAPE_URL}brand=01&item1=&item2=&ya19=&keyword=&recommand=1412170001&ob=F"
|
14
|
+
POPULAR_URI = "#{BASE_SCRAPE_URL}brand=01&item1=&item2=&ya19=&keyword=&recommand=1305080002&ob=F"
|
15
|
+
TOPS_URI = "#{BASE_SCRAPE_URL}brand=01&item1=110&item2=111&ya19=&keyword=&recommand=&ob=F"
|
16
|
+
PANTS_URI = "#{BASE_SCRAPE_URL}brand=01&item1=120&item2=121&ya19=&keyword=&recommand=&ob=F"
|
17
|
+
ACCESSORIES_URI = "#{BASE_SCRAPE_URL}brand=01&item1=140&item2=141&ya19=&keyword=&recommand=&ob=F"
|
18
|
+
|
19
|
+
# xml selectors that will be used to scrape data
|
20
|
+
ITEM_SELECTOR = "//div[contains(@class, 'NEW_shop_list')]/ul/li/div[contains(@class, 'NEW_shop_list_pic')]"
|
21
|
+
ITEM_INFO_SELECTOR = "div[contains(@class, 'NEW_shop_list_info')]"
|
22
|
+
TITLE_SELECTOR = "#{ITEM_INFO_SELECTOR}/div[1]"
|
23
|
+
IMAGE_SELECTOR = "a/img[contains(@class, 'lazyload')]/@src"
|
24
|
+
PRICE_SELECTOR = "#{ITEM_INFO_SELECTOR}/span"
|
25
|
+
LINK_SELECTOR = "a[1]/@href"
|
28
26
|
|
29
27
|
# Regular
|
30
|
-
|
28
|
+
TITLE_REGEX = /([.\p{Han}[a-zA-Z]]+)/
|
31
29
|
|
32
|
-
def latest(page, options={})
|
33
|
-
uri =
|
34
|
-
|
35
|
-
data = parse_html(body)
|
36
|
-
filter(data, options)
|
30
|
+
def latest(page, options = {})
|
31
|
+
uri = uri_with_options(build_uri(LATEST_URI, options), page)
|
32
|
+
process_request(uri, options)
|
37
33
|
end
|
38
34
|
|
39
|
-
def popular(page, options={})
|
40
|
-
uri =
|
41
|
-
|
42
|
-
data = parse_html(body)
|
43
|
-
filter(data, options)
|
35
|
+
def popular(page, options = {})
|
36
|
+
uri = uri_with_options(build_uri(POPULAR_URI, options), page)
|
37
|
+
process_request(uri, options)
|
44
38
|
end
|
45
39
|
|
46
|
-
def tops(page, options={})
|
47
|
-
uri =
|
48
|
-
|
49
|
-
data = parse_html(body)
|
50
|
-
filter(data, options)
|
40
|
+
def tops(page, options = {})
|
41
|
+
uri = uri_with_options(build_uri(TOPS_URI, options), page)
|
42
|
+
process_request(uri, options)
|
51
43
|
end
|
52
44
|
|
53
|
-
def pants(page, options={})
|
54
|
-
uri =
|
55
|
-
|
56
|
-
data = parse_html(body)
|
57
|
-
filter(data, options)
|
45
|
+
def pants(page, options = {})
|
46
|
+
uri = uri_with_options(build_uri(PANTS_URI, options), page)
|
47
|
+
process_request(uri, options)
|
58
48
|
end
|
59
49
|
|
60
|
-
def accessories(page, options={})
|
61
|
-
uri =
|
62
|
-
|
63
|
-
data = parse_html(body)
|
64
|
-
filter(data, options)
|
50
|
+
def accessories(page, options = {})
|
51
|
+
uri = uri_with_options(build_uri(ACCESSORIES_URI, options), page)
|
52
|
+
process_request(uri, options)
|
65
53
|
end
|
66
54
|
|
67
|
-
def search(
|
68
|
-
uri =
|
69
|
-
|
70
|
-
data = parse_html(body)
|
71
|
-
filter(data, options)
|
55
|
+
def search(page, options = {})
|
56
|
+
uri = uri_with_options(build_uri(BASE_SCRAPE_URL, options), page)
|
57
|
+
process_request(uri, options)
|
72
58
|
end
|
73
59
|
|
74
|
-
def scrape(type,
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
60
|
+
def scrape(type, options = {})
|
61
|
+
records = []
|
62
|
+
valid_args = [:tops, :popular, :pants, :pants,
|
63
|
+
:accessories, :latest, :search]
|
64
|
+
abort 'invalid parameter - scrape type' unless valid_args.include?(type.to_sym)
|
65
|
+
scrape_what(type, options)
|
79
66
|
end
|
80
67
|
|
81
68
|
private
|
82
|
-
def uri_with_page(uri, page)
|
83
|
-
"#{uri}&pageno=#{page}"
|
84
|
-
end
|
85
|
-
|
86
|
-
def uri_with_search(keyword)
|
87
|
-
"#{@@SEARCH_URI}keyword=#{URI.escape(keyword)}"
|
88
|
-
end
|
89
69
|
|
90
|
-
def
|
91
|
-
|
70
|
+
def process_request(uri, options)
|
71
|
+
body = open_uri(uri)
|
72
|
+
data = extract_data(body)
|
73
|
+
filter(data, options)
|
92
74
|
end
|
93
75
|
|
94
|
-
#
|
95
|
-
# ------------------------------------------------------------
|
76
|
+
# filter by price if the options are not empty
|
96
77
|
def filter(data, options)
|
97
78
|
results = data
|
98
|
-
|
99
79
|
unless options.empty?
|
100
80
|
results = match_price(results, options[:price_boundary]) if options[:price_boundary]
|
101
81
|
end
|
102
|
-
|
103
82
|
results
|
104
83
|
end
|
105
84
|
|
85
|
+
# do the actual extraction of prices from the result set
|
106
86
|
def match_price(data, boundary)
|
107
87
|
lower_bound = boundary.first || 0
|
108
88
|
upper_bound = boundary.last || Float::INFINITY
|
@@ -110,14 +90,39 @@ module JoyceShop
|
|
110
90
|
data.select { |item| lower_bound <= item[:price] && item[:price] <= upper_bound }
|
111
91
|
end
|
112
92
|
|
113
|
-
|
114
|
-
|
115
|
-
|
93
|
+
def build_uri(uri, options = {})
|
94
|
+
opts = { uri: uri }
|
95
|
+
unless options.empty?
|
96
|
+
opts[:keyword] = options[:keyword] if options[:keyword]
|
97
|
+
end
|
98
|
+
opts
|
99
|
+
end
|
100
|
+
|
101
|
+
def uri_with_options(options = {}, page)
|
102
|
+
uri = ''
|
103
|
+
unless options.empty?
|
104
|
+
keyword = options[:keyword] || nil
|
105
|
+
uri << "#{options[:uri]}&pageno=#{page}" if options[:uri]
|
106
|
+
uri << "br=X&keyword=#{URI.escape(keyword)}" if options[:keyword]
|
107
|
+
end
|
108
|
+
uri
|
109
|
+
end
|
110
|
+
|
111
|
+
# try to open the URL, fail on error
|
112
|
+
def open_uri(uri)
|
113
|
+
open(uri) {|file| file.read}
|
114
|
+
rescue StandardError
|
115
|
+
'error opening site url'
|
116
|
+
end
|
117
|
+
|
118
|
+
# iterate over every element of item using xpath
|
119
|
+
def extract_data(raw)
|
116
120
|
Oga.parse_html(raw)
|
117
|
-
.xpath(
|
121
|
+
.xpath(ITEM_SELECTOR)
|
118
122
|
.map { |item| parse(item) }
|
119
123
|
end
|
120
124
|
|
125
|
+
# call methods to extract the data using xpath
|
121
126
|
def parse(item)
|
122
127
|
{
|
123
128
|
title: extract_title(item),
|
@@ -127,24 +132,41 @@ module JoyceShop
|
|
127
132
|
}
|
128
133
|
end
|
129
134
|
|
135
|
+
# Iconv is necessary here, otherwise text is unreadable
|
130
136
|
def extract_title(item)
|
131
|
-
item.xpath(
|
132
|
-
.scan(
|
137
|
+
item.xpath(TITLE_SELECTOR).text
|
138
|
+
.scan(TITLE_REGEX)
|
133
139
|
.flatten[0]
|
134
140
|
end
|
135
141
|
|
142
|
+
# get rid of the NT and convert to integer
|
136
143
|
def extract_price(item)
|
137
|
-
item.xpath(
|
144
|
+
item.xpath(PRICE_SELECTOR).text.sub(/NT. /, '').to_i
|
138
145
|
end
|
139
146
|
|
147
|
+
# extract two images and return array of urls
|
140
148
|
def extract_images(item)
|
141
|
-
image = item.xpath(
|
149
|
+
image = item.xpath(IMAGE_SELECTOR).text
|
142
150
|
image_hover = image.sub(/\.jpg/, '-h.jpg')
|
143
|
-
|
151
|
+
image_hover = image.sub(/\.png/, '-h.png') unless image_hover != image
|
152
|
+
["#{BASE_URL}#{image}", "#{BASE_URL}#{image_hover}"]
|
144
153
|
end
|
145
154
|
|
155
|
+
# get the link to the item
|
146
156
|
def extract_link(item)
|
147
|
-
"#{
|
157
|
+
"#{BASE_URL}/#{item.xpath(LINK_SELECTOR).text}"
|
158
|
+
end
|
159
|
+
|
160
|
+
def scrape_what(type, options)
|
161
|
+
records = []
|
162
|
+
pl = options[:page_limit].to_i
|
163
|
+
page_limit = pl != 0 ? pl : 5
|
164
|
+
|
165
|
+
1.upto(page_limit) do |page|
|
166
|
+
method = self.method(type)
|
167
|
+
records.push(method.call(page, options))
|
168
|
+
end
|
169
|
+
records.reject { |c| c.empty? }.flatten(1).uniq
|
148
170
|
end
|
149
171
|
end
|
150
172
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: joyceshop
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Even Chang
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2016-01-
|
14
|
+
date: 2016-01-10 00:00:00.000000000 Z
|
15
15
|
dependencies: []
|
16
16
|
description: This is a gem scraping joyceshop's website and returns the popular/latest
|
17
17
|
items
|