stylemooncat 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8c7e37b6424726cdec5907fa1fed3985228a2eca
4
+ data.tar.gz: 2c42777971df54b19607c98cb352966f8c8313ce
5
+ SHA512:
6
+ metadata.gz: 1cae3ab2235f3022a2d54712b0591191dbb467aadcfb85ff80ad2e3505c5b4eaf839a907e32f803a5926417614e54869978fbaa36a6e0582be51ada006a0831a
7
+ data.tar.gz: db75b57f6f91455db0cf1c5ff3df22b0d86b495c882c8f0b2bef3c7ca5c4765389a9c1d947950bc38117d438bc958b8ded05a2e2f0d797605c3c4faa6c69554d
data/bin/stylemooncat ADDED
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/stylemooncat.rb'
4
+
5
# Command-line entry point: print the scraped "top" category listing for the
# page number given as the first command-line argument.
scraper = StyleMoonCat::Scraper.new
puts scraper.get_top(ARGV[0])
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env ruby
2
+ require_relative 'stylemooncat/scraper'
@@ -0,0 +1,156 @@
1
+ #!/usr/bin/env ruby
2
+ require 'oga'
3
+ require 'open-uri'
4
+ require 'open-uri-s3'
5
+
6
+ # scrape data
7
# Scrapes product listings from the StyleMoonCat online shop.
module StyleMoonCat
  # Downloads one page of a category listing and converts every product tile
  # found on it into a Hash with the keys :title, :price, :images and :link.
  #
  # Each public +get_*+ method takes the page number to request and returns an
  # Array of such hashes (empty when the page contains no product tiles).
  class Scraper
    # URI
    BASE_URI = 'http://www.stylemooncat.com.tw'.freeze

    NEW_ARRIVALS_URI = "#{BASE_URI}/PDList.asp?recommand=1312090001".freeze
    LAST_WEEK_URI = "#{BASE_URI}/PDList.asp?recommand=1312090002".freeze
    SPECIAL_DISCOUNT_URI = "#{BASE_URI}/PDList.asp?recommand=1312090003".freeze

    TOP_URI = "#{BASE_URI}/PDList.asp?p1=01".freeze
    BOTTOM_URI = "#{BASE_URI}/PDList.asp?p1=02".freeze
    OUTER_URI = "#{BASE_URI}/PDList.asp?p1=03".freeze
    DRESS_URI = "#{BASE_URI}/PDList.asp?p1=04".freeze
    SHOES_URI = "#{BASE_URI}/PDList.asp?p1=05&p2=01".freeze
    BAG_URI = "#{BASE_URI}/PDList.asp?p1=05&p2=02".freeze
    ACCESSORIES_URI = "#{BASE_URI}/PDList.asp?p1=06".freeze

    # Selectors
    ITEM_XPATH = "//div[contains(@class, 'goodsBox')]/div[contains(@class, 'goodl')]".freeze
    # The selectors below are evaluated relative to one item node, so they
    # must not start with '/'.
    LINK_XPATH = 'a'.freeze
    IMAGE_XPATH = 'a/img'.freeze
    # This element's text holds the title followed by the price text.
    TITLE_XPATH = "div[contains(@class, 'pd_info_l')]".freeze
    PRICE_SPAN_XPATH = "div[contains(@class, 'pd_info_l')]/span".freeze
    PRICE_STRIKE_XPATH = "div[contains(@class, 'pd_info_l')]/strike".freeze

    # Regular expression for titles (currently not used by the scraper).
    TITLE_REGEX = /([.\p{Han}[a-zA-Z]]+)/

    # Text that separates the title from the price(s) inside TITLE_XPATH.
    PRICE_MARKER = 'TWD.'.freeze
    # One price: a run of digits, optionally with thousands separators.
    PRICE_TOKEN = /\d[\d,]*/

    # @param page [Integer, String] page number of the listing
    # @return [Array<Hash>] items of the "new arrivals" listing
    def get_new_arrival(page)
      scrape(NEW_ARRIVALS_URI, page)
    end

    # @param page [Integer, String] page number of the listing
    # @return [Array<Hash>] items of the "last week" listing
    def get_last_week(page)
      scrape(LAST_WEEK_URI, page)
    end

    # @param page [Integer, String] page number of the listing
    # @return [Array<Hash>] items of the "special discount" listing
    def get_special_discount(page)
      scrape(SPECIAL_DISCOUNT_URI, page)
    end

    # @param page [Integer, String] page number of the listing
    # @return [Array<Hash>] items of the "top" category
    def get_top(page)
      scrape(TOP_URI, page)
    end

    # @param page [Integer, String] page number of the listing
    # @return [Array<Hash>] items of the "bottom" category
    def get_bottom(page)
      scrape(BOTTOM_URI, page)
    end

    # @param page [Integer, String] page number of the listing
    # @return [Array<Hash>] items of the "outer" category
    def get_outer(page)
      scrape(OUTER_URI, page)
    end

    # @param page [Integer, String] page number of the listing
    # @return [Array<Hash>] items of the "dress" category
    def get_dress(page)
      scrape(DRESS_URI, page)
    end

    # @param page [Integer, String] page number of the listing
    # @return [Array<Hash>] items of the "shoes" category
    def get_shoes(page)
      scrape(SHOES_URI, page)
    end

    # @param page [Integer, String] page number of the listing
    # @return [Array<Hash>] items of the "bag" category
    def get_bag(page)
      scrape(BAG_URI, page)
    end

    # @param page [Integer, String] page number of the listing
    # @return [Array<Hash>] items of the "accessories" category
    def get_accessories(page)
      scrape(ACCESSORIES_URI, page)
    end

    private

    # Shared pipeline behind every public get_* method:
    # build the page URI, download it, extract the items.
    def scrape(base_uri, page)
      filter(fetch_data(uri_with_page(base_uri, page)))
    end

    # Appends the page number as the "pageno" query parameter. Every listing
    # URI above already carries a query string, hence the '&'.
    def uri_with_page(uri, page)
      "#{uri}&pageno=#{page}"
    end

    # Downloads the document at +uri+ and returns its body as a String.
    # Goes through open-uri's URI#read instead of Kernel#open: Kernel#open no
    # longer accepts URLs on Ruby 3.0+, and it would treat a string starting
    # with '|' as a command to run.
    def fetch_data(uri)
      URI.parse(uri).read
    end

    # Parses the raw HTML and maps every node matching ITEM_XPATH to a Hash.
    def filter(raw)
      Oga.parse_html(raw).xpath(ITEM_XPATH).map { |item| parse(item) }
    end

    # Builds the result Hash for one product tile. Note that :images holds a
    # single image URL (the first <img> inside the tile's link), or nil.
    def parse(item)
      {
        title: extract_title(item),
        price: extract_price(item),
        images: extract_images(item),
        link: extract_link(item)
      }
    end

    # Returns the text in front of the first "TWD", unmodified (surrounding
    # whitespace is kept), or nil when the title element has no text.
    def extract_title(item)
      item.xpath(TITLE_XPATH).text.split('TWD').first
    end

    # Returns the selling price as a String of digits, or nil when the tile
    # shows no price.
    #
    # With a discount the text after "TWD." is "originalPrice sellingPrice".
    # The separator is not reliably a plain space, so instead of guessing its
    # position from the string length we collect every number and keep the
    # last one, which is the selling price in both the discounted and the
    # undiscounted case.
    def extract_price(item)
      price_text = item.xpath(TITLE_XPATH).text.split(PRICE_MARKER, 2)[1]
      return nil if price_text.nil?

      price_text.scan(PRICE_TOKEN).last
    end

    # Returns the src of the tile's first image, or nil if there is none.
    def extract_images(item)
      first_attribute_value(item, IMAGE_XPATH, :src)
    end

    # Returns the absolute URL of the product page, or nil if the tile has no
    # link. The href found in the page is appended to BASE_URI.
    def extract_link(item)
      href = first_attribute_value(item, LINK_XPATH, :href)
      href && "#{BASE_URI}/#{href}"
    end

    # Value of attribute +name+ on the first node matching +xpath+ that has
    # it, or nil. Nodes without the attribute are skipped so that a single
    # malformed tile cannot abort the whole page.
    def first_attribute_value(item, xpath, name)
      attribute = item.xpath(xpath).attribute(name).compact.first
      attribute && attribute.value
    end
  end
end
metadata ADDED
@@ -0,0 +1,55 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: stylemooncat
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Even Chang
8
+ - Luis Herrera
9
+ - Katy Lee
10
+ - Frank Lee
11
+ autorequire:
12
+ bindir: bin
13
+ cert_chain: []
14
+ date: 2015-12-19 00:00:00.000000000 Z
15
+ dependencies: []
16
+ description: This is a gem scraping StyleMoonCat's website and returns certain category's
17
+ items with title, price, image, and link
18
+ email:
19
+ - kiki44552002@gmail.com
20
+ - lmherrera86@gmail.com
21
+ - katylee41024@yahoo.com.tw
22
+ - frank1234211@gmail.com
23
+ executables:
24
+ - stylemooncat
25
+ extensions: []
26
+ extra_rdoc_files: []
27
+ files:
28
+ - bin/stylemooncat
29
+ - lib/stylemooncat.rb
30
+ - lib/stylemooncat/scraper.rb
31
+ homepage: http://rubygems.org/gems/stylemooncat
32
+ licenses:
33
+ - MIT
34
+ metadata: {}
35
+ post_install_message:
36
+ rdoc_options: []
37
+ require_paths:
38
+ - lib
39
+ required_ruby_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ required_rubygems_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ requirements: []
50
+ rubyforge_project:
51
+ rubygems_version: 2.4.6
52
+ signing_key:
53
+ specification_version: 4
54
+ summary: Scraper for StyleMoonCat
55
+ test_files: []