joyceshop 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. checksums.yaml +4 -4
  2. data/bin/joyceshop +31 -3
  3. data/lib/joyceshop/scraper.rb +100 -78
  4. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2c4b143619c3fe55c48d39a9b419f39d6923fce0
4
- data.tar.gz: 1b0fd41b0f9aff1067b680b43e2260a3f7f83cae
3
+ metadata.gz: 05a440353df86f1a449acbe8633aefd1139a0c55
4
+ data.tar.gz: 9cedcb43aca538bb30e5e1de8bf0a17d472e795e
5
5
  SHA512:
6
- metadata.gz: 1b7810cbdc56d47096794d986417082dbe14dc84122e8d0539dbbc2e1f0e4c243f98ba451be45642fafb93fc0f053c0c258e8e4c6fcc6b288caded95c1b961c7
7
- data.tar.gz: bd71e91f8adda7003cd73357e1eb2495c4e262d30e39169242bca461f4baaf0caffc5925849c0a5526f3a9fc3c438e2c05b3df9eb72170b1036e40ef063aa02f
6
+ metadata.gz: e2a8e4481761300cc527fc1103c377a54a1d547ee5d92a3a5fb14ebd60438e10427d8458625ec8484da37f4cb6a5253b464c124ff6a13c7859e67e8de472abfb
7
+ data.tar.gz: 986e94aa57f5ebe8c9470491079c0452c0e06aaa2886d3e767c0191ac3e5c51f07d6b19b462c48b7b20c1a8a5717a4f8e87b8a57ae0fe037a345f23bdf0d9ae2
@@ -2,6 +2,34 @@
2
2
  # require 'joyceshop' # for production
3
3
  require_relative '../lib/joyceshop.rb' # for testing
4
4
 
5
- scraper = JoyceShop::Scraper.new
6
- puts scraper.search('紗針織衫', {price_boundary: [100, 443]})
7
- puts scraper.scrape(:tops, 1)
5
+ @scraper = JoyceShop::Scraper.new
6
+
7
+ # command type keyword lprice hprice page_limit
8
+ def parse_args argv
9
+ input_length = argv.length
10
+ abort 'invalid usage' unless input_length <= 5
11
+
12
+ if input_length == 0 # scrape main category
13
+ @scraper.scrape('latest')
14
+ elsif input_length == 1 # scrape main category
15
+ @scraper.scrape(argv[0])
16
+ elsif input_length == 2
17
+ t = argv[1].to_i
18
+ if t != 0
19
+ options = { page_limit: argv[1] }
20
+ else
21
+ options = { keyword: argv[1] }
22
+ end
23
+ @scraper.scrape(argv[0], options)
24
+ elsif input_length == 3
25
+ options = { keyword: argv[1], page_limit: argv[2] }
26
+ @scraper.scrape(argv[0], options)
27
+ elsif input_length == 5
28
+ options = { keyword: argv[2], page_limit: argv[5],
29
+ price_boundary: [argv[3], argv[4]]
30
+ }
31
+ @scraper.scrape_filter(argv[0], options)
32
+ end
33
+ end
34
+
35
+ puts parse_args ARGV
@@ -5,104 +5,84 @@ require 'open-uri'
5
5
 
6
6
  # scrape data
7
7
  module JoyceShop
8
+ # Scraper uses XPath selectors to extract item attributes
8
9
  class Scraper
9
- # Types
10
- @@VALID_TYPES = [:tops, :popular, :pants, :pants, :accessories, :latest]
11
-
12
- # URI
13
- @@BASE_URI = 'https://www.joyce-shop.com'
14
- @@LATEST_URI = "#{@@BASE_URI}/PDList.asp?brand=01&item1=&item2=&ya19=&keyword=&recommand=1412170001&ob=F"
15
- @@POPULAR_URI = "#{@@BASE_URI}/PDList.asp?brand=01&item1=&item2=&ya19=&keyword=&recommand=1305080002&ob=F"
16
- @@TOPS_URI = "#{@@BASE_URI}/PDList.asp?brand=01&item1=110&item2=111&ya19=&keyword=&recommand=&ob=F"
17
- @@PANTS_URI = "#{@@BASE_URI}/PDList.asp?brand=01&item1=120&item2=121&ya19=&keyword=&recommand=&ob=F"
18
- @@ACCESSORIES_URI = "#{@@BASE_URI}/PDList.asp?brand=01&item1=140&item2=141&ya19=&keyword=&recommand=&ob=F"
19
- @@SEARCH_URI = "#{@@BASE_URI}/PDList.asp?"
20
-
21
- # Selectors
22
- @@ITEM_SELECTOR = "//div[contains(@class, 'NEW_shop_list')]/ul/li/div[contains(@class, 'NEW_shop_list_pic')]"
23
- @@LINK_SELECTOR = 'a[1]/@href'
24
- @@IMAGE_SELECTOR = "a/img[contains(@class, 'lazyload')]/@src"
25
- @@ITEM_INFO_SELECTOR = "div[contains(@class, 'NEW_shop_list_info')]"
26
- @@TITLE_SELECTOR = "#{@@ITEM_INFO_SELECTOR}/div[1]"
27
- @@PRICE_SELECTOR = "#{@@ITEM_INFO_SELECTOR}/span"
10
+ BASE_URL = 'https://www.joyce-shop.com'
11
+ BASE_SCRAPE_URL = "#{BASE_URL}/PDList.asp?"
12
+
13
+ LATEST_URI = "#{BASE_SCRAPE_URL}brand=01&item1=&item2=&ya19=&keyword=&recommand=1412170001&ob=F"
14
+ POPULAR_URI = "#{BASE_SCRAPE_URL}brand=01&item1=&item2=&ya19=&keyword=&recommand=1305080002&ob=F"
15
+ TOPS_URI = "#{BASE_SCRAPE_URL}brand=01&item1=110&item2=111&ya19=&keyword=&recommand=&ob=F"
16
+ PANTS_URI = "#{BASE_SCRAPE_URL}brand=01&item1=120&item2=121&ya19=&keyword=&recommand=&ob=F"
17
+ ACCESSORIES_URI = "#{BASE_SCRAPE_URL}brand=01&item1=140&item2=141&ya19=&keyword=&recommand=&ob=F"
18
+
19
+ # XPath selectors that will be used to scrape data
20
+ ITEM_SELECTOR = "//div[contains(@class, 'NEW_shop_list')]/ul/li/div[contains(@class, 'NEW_shop_list_pic')]"
21
+ ITEM_INFO_SELECTOR = "div[contains(@class, 'NEW_shop_list_info')]"
22
+ TITLE_SELECTOR = "#{ITEM_INFO_SELECTOR}/div[1]"
23
+ IMAGE_SELECTOR = "a/img[contains(@class, 'lazyload')]/@src"
24
+ PRICE_SELECTOR = "#{ITEM_INFO_SELECTOR}/span"
25
+ LINK_SELECTOR = "a[1]/@href"
28
26
 
29
27
  # Regular
30
- @@TITLE_REGEX = /([.\p{Han}[a-zA-Z]]+)/
28
+ TITLE_REGEX = /([.\p{Han}[a-zA-Z]]+)/
31
29
 
32
- def latest(page, options={})
33
- uri = uri_with_page(@@LATEST_URI, page)
34
- body = fetch_data(uri)
35
- data = parse_html(body)
36
- filter(data, options)
30
+ def latest(page, options = {})
31
+ uri = uri_with_options(build_uri(LATEST_URI, options), page)
32
+ process_request(uri, options)
37
33
  end
38
34
 
39
- def popular(page, options={})
40
- uri = uri_with_page(@@POPULAR_URI, page)
41
- body = fetch_data(uri)
42
- data = parse_html(body)
43
- filter(data, options)
35
+ def popular(page, options = {})
36
+ uri = uri_with_options(build_uri(POPULAR_URI, options), page)
37
+ process_request(uri, options)
44
38
  end
45
39
 
46
- def tops(page, options={})
47
- uri = uri_with_page(@@TOPS_URI, page)
48
- body = fetch_data(uri)
49
- data = parse_html(body)
50
- filter(data, options)
40
+ def tops(page, options = {})
41
+ uri = uri_with_options(build_uri(TOPS_URI, options), page)
42
+ process_request(uri, options)
51
43
  end
52
44
 
53
- def pants(page, options={})
54
- uri = uri_with_page(@@PANTS_URI, page)
55
- body = fetch_data(uri)
56
- data = parse_html(body)
57
- filter(data, options)
45
+ def pants(page, options = {})
46
+ uri = uri_with_options(build_uri(PANTS_URI, options), page)
47
+ process_request(uri, options)
58
48
  end
59
49
 
60
- def accessories(page, options={})
61
- uri = uri_with_page(@@ACCESSORIES_URI, page)
62
- body = fetch_data(uri)
63
- data = parse_html(body)
64
- filter(data, options)
50
+ def accessories(page, options = {})
51
+ uri = uri_with_options(build_uri(ACCESSORIES_URI, options), page)
52
+ process_request(uri, options)
65
53
  end
66
54
 
67
- def search(keyword, options={})
68
- uri = uri_with_search(keyword)
69
- body = fetch_data(uri)
70
- data = parse_html(body)
71
- filter(data, options)
55
+ def search(page, options = {})
56
+ uri = uri_with_options(build_uri(BASE_SCRAPE_URL, options), page)
57
+ process_request(uri, options)
72
58
  end
73
59
 
74
- def scrape(type, page, options = {})
75
- abort "only supports #{@@VALID_TYPES}" unless @@VALID_TYPES.include?(type.to_sym)
76
-
77
- method = self.method(type)
78
- method.call(page, options)
60
+ def scrape(type, options = {})
61
+ records = []
62
+ valid_args = [:tops, :popular, :pants, :pants,
63
+ :accessories, :latest, :search]
64
+ abort 'invalid parameter - scrape type' unless valid_args.include?(type.to_sym)
65
+ scrape_what(type, options)
79
66
  end
80
67
 
81
68
  private
82
- def uri_with_page(uri, page)
83
- "#{uri}&pageno=#{page}"
84
- end
85
-
86
- def uri_with_search(keyword)
87
- "#{@@SEARCH_URI}keyword=#{URI.escape(keyword)}"
88
- end
89
69
 
90
- def fetch_data(uri)
91
- open(uri) { |file| file.read }
70
+ def process_request(uri, options)
71
+ body = open_uri(uri)
72
+ data = extract_data(body)
73
+ filter(data, options)
92
74
  end
93
75
 
94
- # Filter
95
- # ------------------------------------------------------------
76
+ # filter by price if the options are not empty
96
77
  def filter(data, options)
97
78
  results = data
98
-
99
79
  unless options.empty?
100
80
  results = match_price(results, options[:price_boundary]) if options[:price_boundary]
101
81
  end
102
-
103
82
  results
104
83
  end
105
84
 
85
+ # filter the result set to items whose price falls within the boundary
106
86
  def match_price(data, boundary)
107
87
  lower_bound = boundary.first || 0
108
88
  upper_bound = boundary.last || Float::INFINITY
@@ -110,14 +90,39 @@ module JoyceShop
110
90
  data.select { |item| lower_bound <= item[:price] && item[:price] <= upper_bound }
111
91
  end
112
92
 
113
- # Parser
114
- # ------------------------------------------------------------
115
- def parse_html(raw)
93
+ def build_uri(uri, options = {})
94
+ opts = { uri: uri }
95
+ unless options.empty?
96
+ opts[:keyword] = options[:keyword] if options[:keyword]
97
+ end
98
+ opts
99
+ end
100
+
101
+ def uri_with_options(options = {}, page)
102
+ uri = ''
103
+ unless options.empty?
104
+ keyword = options[:keyword] || nil
105
+ uri << "#{options[:uri]}&pageno=#{page}" if options[:uri]
106
+ uri << "br=X&keyword=#{URI.escape(keyword)}" if options[:keyword]
107
+ end
108
+ uri
109
+ end
110
+
111
+ # try open the URL, fail on error
112
+ def open_uri(uri)
113
+ open(uri) {|file| file.read}
114
+ rescue StandardError
115
+ 'error opening site url'
116
+ end
117
+
118
+ # iterate over every element of item using xpath
119
+ def extract_data(raw)
116
120
  Oga.parse_html(raw)
117
- .xpath(@@ITEM_SELECTOR)
121
+ .xpath(ITEM_SELECTOR)
118
122
  .map { |item| parse(item) }
119
123
  end
120
124
 
125
+ # call methods to extract the data using xpath
121
126
  def parse(item)
122
127
  {
123
128
  title: extract_title(item),
@@ -127,24 +132,41 @@ module JoyceShop
127
132
  }
128
133
  end
129
134
 
135
+ # Iconv is necessary here otherwise text is unreadable
130
136
  def extract_title(item)
131
- item.xpath(@@TITLE_SELECTOR).text
132
- .scan(@@TITLE_REGEX)
137
+ item.xpath(TITLE_SELECTOR).text
138
+ .scan(TITLE_REGEX)
133
139
  .flatten[0]
134
140
  end
135
141
 
142
+ # get rid of the NT and convert to integer
136
143
  def extract_price(item)
137
- item.xpath(@@PRICE_SELECTOR).text.to_i
144
+ item.xpath(PRICE_SELECTOR).text.sub(/NT. /, '').to_i
138
145
  end
139
146
 
147
+ # extract two images and return array of urls
140
148
  def extract_images(item)
141
- image = item.xpath(@@IMAGE_SELECTOR).text
149
+ image = item.xpath(IMAGE_SELECTOR).text
142
150
  image_hover = image.sub(/\.jpg/, '-h.jpg')
143
- ["#{@@BASE_URI}#{image}", "#{@@BASE_URI}#{image_hover}"]
151
+ image_hover = image.sub(/\.png/, '-h.png') unless image_hover != image
152
+ ["#{BASE_URL}#{image}", "#{BASE_URL}#{image_hover}"]
144
153
  end
145
154
 
155
+ # get the link to the item
146
156
  def extract_link(item)
147
- "#{@@BASE_URI}/#{item.xpath(@@LINK_SELECTOR).text}"
157
+ "#{BASE_URL}/#{item.xpath(LINK_SELECTOR).text}"
158
+ end
159
+
160
+ def scrape_what(type, options)
161
+ records = []
162
+ pl = options[:page_limit].to_i
163
+ page_limit = pl != 0 ? pl : 5
164
+
165
+ 1.upto(page_limit) do |page|
166
+ method = self.method(type)
167
+ records.push(method.call(page, options))
168
+ end
169
+ records.reject { |c| c.empty? }.flatten(1).uniq
148
170
  end
149
171
  end
150
172
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: joyceshop
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Even Chang
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2016-01-04 00:00:00.000000000 Z
14
+ date: 2016-01-10 00:00:00.000000000 Z
15
15
  dependencies: []
16
16
  description: This is a gem scraping joyceshop's website and returns the popular/latest
17
17
  items