stylemooncat 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/stylemooncat +6 -0
- data/lib/stylemooncat.rb +2 -0
- data/lib/stylemooncat/scraper.rb +156 -0
- metadata +55 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8c7e37b6424726cdec5907fa1fed3985228a2eca
|
4
|
+
data.tar.gz: 2c42777971df54b19607c98cb352966f8c8313ce
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 1cae3ab2235f3022a2d54712b0591191dbb467aadcfb85ff80ad2e3505c5b4eaf839a907e32f803a5926417614e54869978fbaa36a6e0582be51ada006a0831a
|
7
|
+
data.tar.gz: db75b57f6f91455db0cf1c5ff3df22b0d86b495c882c8f0b2bef3c7ca5c4765389a9c1d947950bc38117d438bc958b8ded05a2e2f0d797605c3c4faa6c69554d
|
data/bin/stylemooncat
ADDED
data/lib/stylemooncat.rb
ADDED
@@ -0,0 +1,156 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'oga'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'open-uri-s3'
|
5
|
+
|
6
|
+
# scrape data
|
7
|
+
# Namespace for the StyleMoonCat scraping gem.
module StyleMoonCat
  # Scrapes product listings from the StyleMoonCat web shop.
  #
  # Every public +get_*+ method downloads one listing page and returns an
  # Array of Hashes with the keys +:title+, +:price+, +:images+ and +:link+.
  class Scraper
    # Site root; listing URIs and product links are built on top of it.
    BASE_URI = 'http://www.stylemooncat.com.tw'.freeze

    # Listings selected through the site's `recommand` query parameter.
    NEW_ARRIVALS_URI = "#{BASE_URI}/PDList.asp?recommand=1312090001".freeze
    LAST_WEEK_URI = "#{BASE_URI}/PDList.asp?recommand=1312090002".freeze
    SPECIAL_DISCOUNT_URI = "#{BASE_URI}/PDList.asp?recommand=1312090003".freeze

    # Listings selected through the `p1` / `p2` category query parameters.
    TOP_URI = "#{BASE_URI}/PDList.asp?p1=01".freeze
    BOTTOM_URI = "#{BASE_URI}/PDList.asp?p1=02".freeze
    OUTER_URI = "#{BASE_URI}/PDList.asp?p1=03".freeze
    DRESS_URI = "#{BASE_URI}/PDList.asp?p1=04".freeze
    SHOES_URI = "#{BASE_URI}/PDList.asp?p1=05&p2=01".freeze
    BAG_URI = "#{BASE_URI}/PDList.asp?p1=05&p2=02".freeze
    ACCESSORIES_URI = "#{BASE_URI}/PDList.asp?p1=06".freeze

    # XPath selectors. ITEM_XPATH is evaluated against the whole document;
    # the others are evaluated relative to a single item node, so they must
    # not start with a '/'.
    ITEM_XPATH = "//div[contains(@class, 'goodsBox')]/div[contains(@class, 'goodl')]".freeze
    LINK_XPATH = 'a'.freeze
    IMAGE_XPATH = 'a/img'.freeze
    TITLE_XPATH = "div[contains(@class, 'pd_info_l')]".freeze
    # The two price selectors and TITLE_REGEX are not used by this class at
    # the moment; they are kept so existing references keep resolving.
    PRICE_SPAN_XPATH = "div[contains(@class, 'pd_info_l')]/span".freeze
    PRICE_STRIKE_XPATH = "div[contains(@class, 'pd_info_l')]/strike".freeze
    TITLE_REGEX = /([.\p{Han}[a-zA-Z]]+)/

    # Text that separates the title from the price(s) in the info block.
    PRICE_MARKER = 'TWD.'.freeze
    # One price: a run of digits, optionally containing thousands separators.
    PRICE_NUMBER_REGEX = /\d[\d,]*/

    # @param page [Integer, String] value sent as the `pageno` query parameter
    # @return [Array<Hash>] items of the "new arrivals" listing
    def get_new_arrival(page)
      scrape(NEW_ARRIVALS_URI, page)
    end

    # @param page [Integer, String] value sent as the `pageno` query parameter
    # @return [Array<Hash>] items of the "last week" listing
    def get_last_week(page)
      scrape(LAST_WEEK_URI, page)
    end

    # @param page [Integer, String] value sent as the `pageno` query parameter
    # @return [Array<Hash>] items of the "special discount" listing
    def get_special_discount(page)
      scrape(SPECIAL_DISCOUNT_URI, page)
    end

    # @param page [Integer, String] value sent as the `pageno` query parameter
    # @return [Array<Hash>] items of the "top" category
    def get_top(page)
      scrape(TOP_URI, page)
    end

    # @param page [Integer, String] value sent as the `pageno` query parameter
    # @return [Array<Hash>] items of the "bottom" category
    def get_bottom(page)
      scrape(BOTTOM_URI, page)
    end

    # @param page [Integer, String] value sent as the `pageno` query parameter
    # @return [Array<Hash>] items of the "outer" category
    def get_outer(page)
      scrape(OUTER_URI, page)
    end

    # @param page [Integer, String] value sent as the `pageno` query parameter
    # @return [Array<Hash>] items of the "dress" category
    def get_dress(page)
      scrape(DRESS_URI, page)
    end

    # @param page [Integer, String] value sent as the `pageno` query parameter
    # @return [Array<Hash>] items of the "shoes" category
    def get_shoes(page)
      scrape(SHOES_URI, page)
    end

    # @param page [Integer, String] value sent as the `pageno` query parameter
    # @return [Array<Hash>] items of the "bag" category
    def get_bag(page)
      scrape(BAG_URI, page)
    end

    # @param page [Integer, String] value sent as the `pageno` query parameter
    # @return [Array<Hash>] items of the "accessories" category
    def get_accessories(page)
      scrape(ACCESSORIES_URI, page)
    end

    private

    # Shared pipeline behind every public getter: build the page URI,
    # download it, and turn the HTML into item hashes.
    def scrape(listing_uri, page)
      filter(fetch_data(uri_with_page(listing_uri, page)))
    end

    # Appends the page number to a listing URI that already has a query
    # string. The value is form-encoded so it cannot add extra parameters.
    def uri_with_page(uri, page)
      "#{uri}&pageno=#{URI.encode_www_form_component(page.to_s)}"
    end

    # Downloads +uri+ and returns the response body as a String.
    # Goes through URI#open (provided by open-uri) rather than Kernel#open,
    # which no longer accepts URLs on Ruby 3.0 and later.
    def fetch_data(uri)
      URI.parse(uri).open(&:read)
    end

    # Parses an HTML document and returns one hash per matched item node.
    def filter(raw)
      Oga.parse_html(raw)
         .xpath(ITEM_XPATH)
         .map { |item| parse(item) }
    end

    # Builds the result hash for a single item node.
    def parse(item)
      {
        title: extract_title(item),
        price: extract_price(item),
        images: extract_images(item),
        link: extract_link(item)
      }
    end

    # Returns the info-block text that precedes "TWD", or nil when the
    # info block is empty.
    def extract_title(item)
      item.xpath(TITLE_XPATH).text.split('TWD')[0]
    end

    # Returns the selling price as a String of digits, or nil when the info
    # block carries no price.
    #
    # When an item is discounted the text after "TWD." holds the original
    # price followed by the selling price, separated by a character that
    # String#split(' ') does not treat as whitespace. Scanning for numbers
    # and taking the last one works for any separator and any price length.
    def extract_price(item)
      _title, marker, prices = item.xpath(TITLE_XPATH).text.partition(PRICE_MARKER)
      return nil if marker.empty?

      prices.scan(PRICE_NUMBER_REGEX).last
    end

    # Returns the +src+ of the item's first image, or nil when there is none.
    def extract_images(item)
      first_attribute_value(item, IMAGE_XPATH, :src)
    end

    # Returns the absolute product URI, or nil when the item has no link.
    def extract_link(item)
      href = first_attribute_value(item, LINK_XPATH, :href)
      href && "#{BASE_URI}/#{href}"
    end

    # Value of attribute +name+ on the first node matched by +xpath+ under
    # +item+; nil when no node matches or the attribute is absent.
    def first_attribute_value(item, xpath, name)
      attribute = item.xpath(xpath).attribute(name).first
      attribute && attribute.value
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: stylemooncat
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Even Chang
|
8
|
+
- Luis Herrera
|
9
|
+
- Katy Lee
|
10
|
+
- Frank Lee
|
11
|
+
autorequire:
|
12
|
+
bindir: bin
|
13
|
+
cert_chain: []
|
14
|
+
date: 2015-12-19 00:00:00.000000000 Z
|
15
|
+
dependencies: []
|
16
|
+
description: This is a gem that scrapes StyleMoonCat's website and returns a given category's
|
17
|
+
items with title, price, image, and link
|
18
|
+
email:
|
19
|
+
- kiki44552002@gmail.com
|
20
|
+
- lmherrera86@gmail.com
|
21
|
+
- katylee41024@yahoo.com.tw
|
22
|
+
- frank1234211@gmail.com
|
23
|
+
executables:
|
24
|
+
- stylemooncat
|
25
|
+
extensions: []
|
26
|
+
extra_rdoc_files: []
|
27
|
+
files:
|
28
|
+
- bin/stylemooncat
|
29
|
+
- lib/stylemooncat.rb
|
30
|
+
- lib/stylemooncat/scraper.rb
|
31
|
+
homepage: http://rubygems.org/gems/stylemooncat
|
32
|
+
licenses:
|
33
|
+
- MIT
|
34
|
+
metadata: {}
|
35
|
+
post_install_message:
|
36
|
+
rdoc_options: []
|
37
|
+
require_paths:
|
38
|
+
- lib
|
39
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - ">="
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '0'
|
49
|
+
requirements: []
|
50
|
+
rubyforge_project:
|
51
|
+
rubygems_version: 2.4.6
|
52
|
+
signing_key:
|
53
|
+
specification_version: 4
|
54
|
+
summary: Scraper for StyleMoonCat
|
55
|
+
test_files: []
|