stylemooncat 0.0.10 → 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6739f622f28c58608c1db334945e4816776838c1
4
- data.tar.gz: 3529fa4d57a1a4c6ec2f1bef6926780c73153d58
3
+ metadata.gz: f5cfb085ac84735961b167a059fe04482f180b8d
4
+ data.tar.gz: ad30174c3781867cf7405451fb3fa5c9651d35ee
5
5
  SHA512:
6
- metadata.gz: e3f979775d6fb6ef19ebeb553f616e42b9b0e7ec836d7646eee7246932861fef9c8bfaf57bcfba9975fda4179d5345946b205277c849c938fab7b58c3936210d
7
- data.tar.gz: a6be7b887e27661828b08775ffa0cfb4933e47bde7b6541f86615683598f17d933121463c7651e0e2c5cdbbe729d811de53c995a8b5832ca31764e799063cb95
6
+ metadata.gz: 8caf2f92614c4dd5ad4ecbe933abbb174f45903b31ae59870d8ccb07840a34a7e4977f421c6b7decdb7cd2ab3ff43d4c22e84f02f76ab5eca635598c9158416f
7
+ data.tar.gz: 25d506a372bce080542617797244f4fcad86c1fe4844c0453962d6ff4fee4ecaaa422ccfcbcf600933a2cd9b82c463b49fe87d835ccf2b84357c58580317f235
data/bin/stylemooncat CHANGED
@@ -1,25 +1,35 @@
1
1
  #!/usr/bin/env ruby
2
-
3
- require_relative '../lib/stylemooncat.rb'
2
+ # require 'queenshop' # for production
3
+ require_relative '../lib/stylemooncat'
4
4
 
5
5
  @scraper = StyleMoonCat::Scraper.new
6
- options ={}
7
- options[:page_limit]=ARGV[1]
8
-
9
- price_boundary=[]
10
- price_boundary.push(ARGV[3])
11
- price_boundary.push(ARGV[4])
12
6
 
13
- options[:keyword]=ARGV[2]
14
- options[:price_boundary]=price_boundary
7
+ # command type keyword lprice hprice page_limit
8
+ def parse_args argv
9
+ input_length = argv.length
10
+ abort 'invalid usage' unless input_length <= 5
15
11
 
16
- #puts @scraper.scrape(ARGV[0],ARGV[1],ARGV[2],ARGV[3],ARGV[4],ARGV[5])
17
- if ARGV[5]=='color'
18
- puts @scraper.scrape_contain_color(ARGV[0],options)
19
- else
20
- puts @scraper.scrape(ARGV[0],options)
12
+ if input_length == 0 # scrape main category
13
+ @scraper.scrape('latest')
14
+ elsif input_length == 1 # scrape main category
15
+ @scraper.scrape(argv[0])
16
+ elsif input_length == 2
17
+ t = argv[1].to_i
18
+ if t != 0
19
+ options = { page_limit: argv[1] }
20
+ else
21
+ options = { keyword: argv[1] }
22
+ end
23
+ @scraper.scrape(argv[0], options)
24
+ elsif input_length == 3
25
+ options = { keyword: argv[1], page_limit: argv[2] }
26
+ @scraper.scrape(argv[0], options)
27
+ elsif input_length == 5
28
+ options = { keyword: argv[2], page_limit: argv[5],
29
+ price_boundary: [argv[3], argv[4]]
30
+ }
31
+ @scraper.scrape_filter(argv[0], options)
32
+ end
21
33
  end
22
34
 
23
- #puts @scraper.scrape("shoes",{:keyword=>"none",:page_limit=>3,:price_boundary=>[0,600]})
24
- #puts '-----------'
25
- #puts @scraper.scrape("shoes",{:keyword=>"跟鞋",:page_limit=>1})
35
+ puts parse_args ARGV
@@ -1,236 +1,127 @@
1
1
  #!/usr/bin/env ruby
2
2
  require 'oga'
3
+ require 'iconv'
3
4
  require 'open-uri'
4
5
 
5
-
6
6
  # scrape data
7
7
  module StyleMoonCat
8
+ # extract_data class uses xpath selectors to get attribs
8
9
  class Scraper
9
- # URI
10
- @@BASE_URI = 'http://www.stylemooncat.com.tw'
11
-
12
- @@NEW_ARRIVALS_URI = "#{@@BASE_URI}/PDList.asp?recommand=1312090001"
13
- @@LAST_WEEK_URI = "#{@@BASE_URI}/PDList.asp?recommand=1312090002"
14
- @@SPECIAL_DISCOUNT_URI = "#{@@BASE_URI}/PDList.asp?recommand=1312090003"
15
-
16
- @@ALL_ITEMS_URI = "#{@@BASE_URI}/PDList.asp?item1=00"
17
- @@TOP_URI = "#{@@BASE_URI}/PDList.asp?p1=01"
18
- @@TOPS_Clothes_URI = "#{@@BASE_URI}/PDList.asp?p1=01&p2=01"
19
- @@TOPS_Tshirt_URI = "#{@@BASE_URI}/PDList.asp?p1=01&p2=02"
20
- @@TOPS_Vest_URI = "#{@@BASE_URI}/PDList.asp?p1=01&p2=03"
21
- @@TOPS_Blouse_URI = "#{@@BASE_URI}/PDList.asp?p1=01&p2=04"
22
- @@TOPS_Knit_URI = "#{@@BASE_URI}/PDList.asp?p1=01&p2=05"
23
- @@BOTTOM_URI = "#{@@BASE_URI}/PDList.asp?p1=02"
24
- @@BOTTOM_Pants_URI = "#{@@BASE_URI}/PDList.asp?p1=02&p2=01"
25
- @@BOTTOM_Skirts_URI = "#{@@BASE_URI}/PDList.asp?p1=02&p2=02"
26
- @@OUTER_URI = "#{@@BASE_URI}/PDList.asp?p1=03"
27
- @@OUTER_Coat_URI = "#{@@BASE_URI}/PDList.asp?p1=03&p2=01"
28
- @@OUTER_Jacket_URI = "#{@@BASE_URI}/PDList.asp?p1=03&p2=02"
29
- @@OUTER_Knit_URI = "#{@@BASE_URI}/PDList.asp?p1=03&p2=03"
30
- @@OUTER_Vest_URI = "#{@@BASE_URI}/PDList.asp?p1=03&p2=04"
31
- @@DRESS_URI = "#{@@BASE_URI}/PDList.asp?p1=04"
32
- @@SHOES_AND_BAGS_URI = "#{@@BASE_URI}/PDList.asp?p1=05"
33
- @@SHOES_URI = "#{@@BASE_URI}/PDList.asp?p1=05&p2=01"
34
- @@BAG_URI = "#{@@BASE_URI}/PDList.asp?p1=05&p2=02"
35
- @@ACCESSORIES_URI = "#{@@BASE_URI}/PDList.asp?p1=06"
36
- @@ACCESSORIES_Watch_URI = "#{@@BASE_URI}/PDList.asp?p1=06&p2=01"
37
- @@ACCESSORIES_Necklace_URI = "#{@@BASE_URI}/PDList.asp?p1=06&p2=02"
38
- @@ACCESSORIES_Ring_URI = "#{@@BASE_URI}/PDList.asp?p1=06&p2=03"
39
- @@ACCESSORIES_Bracelet_URI = "#{@@BASE_URI}/PDList.asp?p1=06&p2=04"
40
- @@ACCESSORIES_Earring_URI = "#{@@BASE_URI}/PDList.asp?p1=06&p2=05"
41
- @@ACCESSORIES_Muffler_URI = "#{@@BASE_URI}/PDList.asp?p1=06&p2=06"
42
- @@ACCESSORIES_Belt_URI = "#{@@BASE_URI}/PDList.asp?p1=06&p2=07"
43
- @@ACCESSORIES_Haircap_URI = "#{@@BASE_URI}/PDList.asp?p1=06&p2=08"
44
- @@ACCESSORIES_Glasses_URI = "#{@@BASE_URI}/PDList.asp?p1=06&p2=09"
45
- @@ACCESSORIES_Socks_URI = "#{@@BASE_URI}/PDList.asp?p1=06&p2=10"
46
- @@ACCESSORIES_Underwear_URI = "#{@@BASE_URI}/PDList.asp?p1=06&p2=11"
47
- @@ACCESSORIES_Others_URI = "#{@@BASE_URI}/PDList.asp?p1=06&p2=12"
48
- # Selectors
49
- @@ITEM_XPATH = "//div[contains(@class, 'goodsBox')]/div[contains(@class, 'goodl')]"
50
- @@LINK_XPATH = 'a'
51
- @@IMAGE_XPATH = "a/img"
52
- @@TITLE_XPATH = "div[contains(@class, 'pd_info_l')]" # /div[contains(@class, 'pd_info_l')] is wrong
53
- @@PRICE_SPAN_XPATH = "div[contains(@class, 'pd_info_l')]/span"
54
- @@PRICE_STRIKE_XPATH = "div[contains(@class, 'pd_info_l')]/strike"
55
-
56
- # Regular ?
57
- @@TITLE_REGEX = /([.\p{Han}[a-zA-Z]]+)/
10
+ BASE_URL = 'http://www.stylemooncat.com.tw'
11
+ BASE_SCRAPE_URL = "#{BASE_URL}/PDList.asp?"
12
+
13
+ SEARCH_URI = "#{BASE_URL}item1=00"
14
+ LATEST_URI = "#{BASE_SCRAPE_URL}recommand=1312090001"
15
+ POPULAR_URI = "#{BASE_SCRAPE_URL}/recommand=1312090003"
16
+ TOPS_URI = "#{BASE_SCRAPE_URL}p1=01"
17
+ PANTS_URI = "#{BASE_SCRAPE_URL}p1=02&p2=01"
18
+ ACCESSORIES_URI = "#{BASE_SCRAPE_URL}p1=06"
19
+
20
+ # xml selectors that will be used to scrape data
21
+ ITEM_SELECTOR = "//div[@class='goodsBox']/div[@class='goodl']"
22
+ TITLE_SELECTOR = "div[@class='pd_info_l']//text()[not(parent::span)]"
23
+ IMAGE_SELECTOR = "a/img/@src"
24
+ PRICE_SELECTOR = "div[@class='pd_info_l']/span//text()[not(parent::strike)]"
25
+ LINK_SELECTOR = "a/@href"
26
+
27
+ def latest(page, options = {})
28
+ uri = uri_with_options(build_uri(LATEST_URI, options), page)
29
+ process_request(uri, options)
30
+ end
58
31
 
59
- @@IsScrpeColor=0;
32
+ def popular(page, options = {})
33
+ uri = uri_with_options(build_uri(POPULAR_URI, options), page)
34
+ process_request(uri, options)
35
+ end
60
36
 
61
- @@COLOR_ITEM_XPATH = "//option"
37
+ def tops(page, options = {})
38
+ uri = uri_with_options(build_uri(TOPS_URI, options), page)
39
+ process_request(uri, options)
40
+ end
62
41
 
63
- def scrape_contain_color(category,options={})
64
- @@IsScrapeColor=1
65
- filter_results = scrape(category,options)
66
- filter_results_with_color = filter_results.each do |x|
67
- # puts x[:link]
68
- body = fetch_data(x[:link])
69
- color = color_extract(body)
70
- x[:colors]= color
71
- end
42
+ def pants(page, options = {})
43
+ uri = uri_with_options(build_uri(PANTS_URI, options), page)
44
+ process_request(uri, options)
45
+ end
72
46
 
73
- return filter_results_with_color
47
+ def accessories(page, options = {})
48
+ uri = uri_with_options(build_uri(ACCESSORIES_URI, options), page)
49
+ process_request(uri, options)
74
50
  end
75
51
 
76
- def color_extract(raw)
77
- # puts Oga.parse_html(raw).xpath(@@ITEM_XPATH).map { |item| parse(item) }
78
- result = Oga.parse_html(raw)
79
- .xpath(@@COLOR_ITEM_XPATH)
80
- .select { |item| item.text.length >4 }
81
- .map { |item| color_parse(item) }
82
- .uniq
83
- return result
52
+ def search(page, options = {})
53
+ uri = uri_with_options(build_uri(BASE_SCRAPE_URL, options), page)
54
+ process_request(uri, options)
84
55
  end
85
56
 
86
- def color_parse(item)
87
- item.text.split(" ")[0].split(":")[1]
57
+ def scrape(type, options = {})
58
+ records = []
59
+ valid_args = [:tops, :popular, :pants, :pants,
60
+ :accessories, :latest, :search]
61
+ abort 'invalid parameter - scrape type' unless valid_args.include?(type.to_sym)
62
+ scrape_what(type, options)
88
63
  end
89
64
 
90
- def scrape(category,options={})
91
- options[:keyword]
65
+ private
92
66
 
93
- # keyword= Iconv.conv('big5','utf-8',options[:keyword])
94
- keyword= options[:keyword]
95
- page_limit=options[:page_limit]
67
+ def process_request(uri, options)
68
+ body = open_uri(uri)
69
+ data = extract_data(body)
70
+ filter(data, options)
71
+ end
96
72
 
97
- if options[:price_boundary]!= nil && options[:price_boundary].length ==2
98
- if options[:price_boundary][0].to_i>options[:price_boundary][1].to_i
99
- price_from = options[:price_boundary][1]
100
- price_to = options[:price_boundary][0]
101
- else
102
- price_from = options[:price_boundary][0]
103
- price_to = options[:price_boundary][1]
104
- end
105
- else
106
- price_from = -1
107
- price_to = -1
73
+ # filter by price if the options are not empty
74
+ def filter(data, options)
75
+ results = data
76
+ unless options.empty?
77
+ results = match_price(results, options[:price_boundary]) if options[:price_boundary]
108
78
  end
109
- @filter_results=[]
110
- @count=1
111
- 1.upto(page_limit.to_i) do
112
- page = @count
113
- case category
114
- when "newarrival"
115
- uri = uri_with_page(@@NEW_ARRIVALS_URI, page)
116
- when "lastweek"
117
- uri = uri_with_page(@@LAST_WEEK_URI, page)
118
- when "specialdiscount"
119
- uri = uri_with_page(@@SPECIAL_DISCOUNT_URI, page)
120
- when "top"
121
- uri = uri_with_page(@@TOP_URI, page)
122
- when "top_clothes"
123
- uri = uri_with_page(@@TOPS_Clothes_URI, page)
124
- when "top_Tshirt"
125
- uri = uri_with_page(@@TOPS_Tshirt_URI, page)
126
- when "top_vest"
127
- uri = uri_with_page(@@TOPS_Vest_URI, page)
128
- when "top_blouse"
129
- uri = uri_with_page(@@TOPS_Blouse_URI, page)
130
- when "top_knit"
131
- uri = uri_with_page(@@TOPS_Knit_URI, page)
132
- when "bottom"
133
- uri = uri_with_page(@@BOTTOM_URI, page)
134
- when "bottom_pants"
135
- uri = uri_with_page(@@BOTTOM_Pants_URI, page)
136
- when "bottom_skirts"
137
- uri = uri_with_page(@@BOTTOM_Skirts_URI, page)
138
- when "outer"
139
- uri = uri_with_page(@@OUTER_URI, page)
140
- when "outer_coat"
141
- uri = uri_with_page(@@OUTER_Coat_URI, page)
142
- when "outer_jacket"
143
- uri = uri_with_page(@@OUTER_Jacket_URI, page)
144
- when "outer_knit"
145
- uri = uri_with_page(@@OUTER_Knit_URI, page)
146
- when "outer_vest"
147
- uri = uri_with_page(@@OUTER_Vest_URI, page)
148
-
149
- when "dress"
150
- uri = uri_with_page(@@DRESS_URI, page)
151
- when "shoes_and_bag"
152
- uri = uri_with_page(@@SHOES_AND_BAGS_URI, page)
153
- when "shoes"
154
- uri = uri_with_page(@@SHOES_URI, page)
155
- when "bag"
156
- uri = uri_with_page(@@BAG_URI, page)
157
- when "accessories"
158
- uri = uri_with_page(@@ACCESSORIES_URI, page)
159
- when "accessories_watch"
160
- uri = uri_with_page(@@ACCESSORIES_Watch_URI, page)
161
- when "accessories_necklace"
162
- uri = uri_with_page(@@ACCESSORIES_Necklace_URI, page)
163
- when "accessories_ring"
164
- uri = uri_with_page(@@ACCESSORIES_Ring_URI, page)
165
- when "accessories_bracelet"
166
- uri = uri_with_page(@@ACCESSORIES_Bracelet_URI, page)
167
- when "accessories_earring"
168
- uri = uri_with_page(@@ACCESSORIES_Earring_URI, page)
169
- when "accessories_muffler"
170
- uri = uri_with_page(@@ACCESSORIES_Muffler_URI, page)
171
- when "accessories_belt"
172
- uri = uri_with_page(@@ACCESSORIES_Belt_URI, page)
173
- when "accessories_haircap"
174
- uri = uri_with_page(@@ACCESSORIES_Haircap_URI, page)
175
- when "accessories_glasses"
176
- uri = uri_with_page(@@ACCESSORIES_Glasses_URI, page)
177
- when "accessories_socks"
178
- uri = uri_with_page(@@ACCESSORIES_Socks_URI, page)
179
- when "accessories_underwear"
180
- uri = uri_with_page(@@ACCESSORIES_Underwear_URI, page)
181
- when "accessories_others"
182
- uri = uri_with_page(@@ACCESSORIES_Others_URI, page)
183
- else
184
- uri = uri_with_page(@@ALL_ITEMS_URI, page)
185
- end
186
-
187
- if (keyword != "none") && (keyword != nil)
188
- uri = uri_with_keyword(uri,keyword)
189
- end
190
- # puts uri
191
- body = fetch_data(uri)
192
- @filter_results = filter(body)
79
+ results
80
+ end
193
81
 
194
- if @count==1
195
- @combine_filter_results = @filter_results
196
- @count +=1
197
- else
198
- if @filter_results.length>0
199
- @combine_filter_results= @final_filter_results.concat(@filter_results)
200
- end
201
- end
202
- end
203
- @count=1
82
+ # do the actual extraction of prices from the result set
83
+ def match_price(data, boundary)
84
+ lower_bound = boundary.first || 0
85
+ upper_bound = boundary.last || Float::INFINITY
204
86
 
205
- #filter with price if there are correct price parameters
206
- if price_to!=nil && price_from!=nil && price_to.to_i >=price_from.to_i && price_from.to_i !=-1 && price_to.to_i !=-1
207
- return @combine_filter_results.select{|x| x[:price].to_i<=price_to.to_i && x[:price].to_i>=price_from.to_i }
208
- else
209
- return @combine_filter_results
210
- end
87
+ data.select { |item| lower_bound <= item[:price] && item[:price] <= upper_bound }
211
88
  end
212
89
 
213
-
214
- private
215
- def uri_with_keyword(uri, keyword)
216
- "#{uri}&keyword=#{URI.escape(keyword)}"
90
+ def build_uri(uri, options = {})
91
+ opts = { uri: uri }
92
+ unless options.empty?
93
+ opts[:keyword] = options[:keyword] if options[:keyword]
94
+ end
95
+ opts
217
96
  end
218
97
 
219
- def uri_with_page(uri, page)
220
- "#{uri}&pageno=#{page}"
98
+ def uri_with_options(options = {}, page)
99
+ uri = ''
100
+ unless options.empty?
101
+ kw = options[:keyword] || nil
102
+ #ic = Iconv.new('big5','UTF-8')
103
+ keyword = kw
104
+ uri << "#{options[:uri]}&pageno=#{page}" if options[:uri]
105
+ uri << "br=X&keyword=#{URI.escape(keyword)}" if options[:keyword]
106
+ end
107
+ uri
221
108
  end
222
109
 
223
- def fetch_data(uri)
110
+ # try to open the URL; on any StandardError return an error message string
111
+ def open_uri(uri)
224
112
  open(uri) {|file| file.read}
113
+ rescue StandardError
114
+ 'error opening site url'
225
115
  end
226
116
 
227
- def filter(raw)
228
- # puts Oga.parse_html(raw).xpath(@@ITEM_XPATH).map { |item| parse(item) }
117
+ # iterate over every element of item using xpath
118
+ def extract_data(raw)
229
119
  Oga.parse_html(raw)
230
- .xpath(@@ITEM_XPATH)
120
+ .xpath(ITEM_SELECTOR)
231
121
  .map { |item| parse(item) }
232
122
  end
233
123
 
124
+ # call methods to extract the data using xpath
234
125
  def parse(item)
235
126
  {
236
127
  title: extract_title(item),
@@ -240,40 +131,42 @@ module StyleMoonCat
240
131
  }
241
132
  end
242
133
 
134
+ # Iconv is necessary here; otherwise the text is unreadable
243
135
  def extract_title(item)
244
- item.xpath(@@TITLE_XPATH).text.split("TWD")[0]
136
+ ic = Iconv.new('UTF-8','big5')
137
+ raw_title = item.xpath(TITLE_SELECTOR).text
138
+ ic.iconv(raw_title)
245
139
  end
246
140
 
141
+ # strip the "TWD." prefix and convert to integer
247
142
  def extract_price(item)
248
-
249
- # if there is discount, priceString format is "originPirce sellingPrice"
250
- # .split(' ') is fail. so use this method to extract sellingPrice
251
- priceString = item.xpath(@@TITLE_XPATH).text.split("TWD.")[1]
252
- length = priceString.length
253
- if length ==8 || length ==9 #ex: priceString == "1200 990" or "1200 1100"
254
- space = priceString[4]
255
- result = priceString.split(space)[1]
256
- elsif length ==7 || length ==6 #ex: priceString == "999 990" or "120 99"
257
- space = priceString[3]
258
- result = priceString.split(space)[1]
259
- elsif length ==5 #ex: priceString == "99 90"
260
- space = priceString[2]
261
- result = priceString.split(space)[1]
262
- else #no discount
263
- result = priceString
264
- end
265
-
266
- result
143
+ price_str = item.xpath(PRICE_SELECTOR).text
144
+ price_str.sub(/TWD./, '').gsub("\u00a0", ' ').to_i
267
145
  end
268
146
 
147
+ # extract two images and return array of urls
269
148
  def extract_images(item)
270
- result=[]
271
- result.push('http://www.stylemooncat.com.tw'+item.xpath(@@IMAGE_XPATH).attribute(:src).first.value)
272
-
149
+ image = item.xpath(IMAGE_SELECTOR).text
150
+ # image_hover = image.sub(/\.jpg/, '-h.jpg')
151
+ # image_hover = image.sub(/\.png/, '-h.png') unless image_hover != image
152
+ ["#{BASE_URL}#{image}"]
273
153
  end
274
154
 
155
+ # get the link to the item
275
156
  def extract_link(item)
276
- "#{@@BASE_URI}/#{item.xpath(@@LINK_XPATH).attribute(:href).first.value}"
157
+ "#{BASE_URL}/#{item.xpath(LINK_SELECTOR).text}"
158
+ end
159
+
160
+ def scrape_what(type, options)
161
+ records = []
162
+ pl = options[:page_limit].to_i
163
+ page_limit = pl != 0 ? pl : 5
164
+
165
+ 1.upto(page_limit) do |page|
166
+ method = self.method(type)
167
+ records.push(method.call(page, options))
168
+ end
169
+ records.reject { |c| c.empty? }.flatten(1).uniq
277
170
  end
278
171
  end
279
172
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: stylemooncat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.10
4
+ version: 0.0.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Even Chang
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2016-01-06 00:00:00.000000000 Z
14
+ date: 2016-01-07 00:00:00.000000000 Z
15
15
  dependencies: []
16
16
  description: This is a gem scraping StyleMoonCat's website.Input category name,page
17
17
  limit,searcing keyword,and price range,and it will return the items with title,price,image,and