joyceshop 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/bin/joyceshop +31 -3
  3. data/lib/joyceshop/scraper.rb +100 -78
  4. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2c4b143619c3fe55c48d39a9b419f39d6923fce0
4
- data.tar.gz: 1b0fd41b0f9aff1067b680b43e2260a3f7f83cae
3
+ metadata.gz: 05a440353df86f1a449acbe8633aefd1139a0c55
4
+ data.tar.gz: 9cedcb43aca538bb30e5e1de8bf0a17d472e795e
5
5
  SHA512:
6
- metadata.gz: 1b7810cbdc56d47096794d986417082dbe14dc84122e8d0539dbbc2e1f0e4c243f98ba451be45642fafb93fc0f053c0c258e8e4c6fcc6b288caded95c1b961c7
7
- data.tar.gz: bd71e91f8adda7003cd73357e1eb2495c4e262d30e39169242bca461f4baaf0caffc5925849c0a5526f3a9fc3c438e2c05b3df9eb72170b1036e40ef063aa02f
6
+ metadata.gz: e2a8e4481761300cc527fc1103c377a54a1d547ee5d92a3a5fb14ebd60438e10427d8458625ec8484da37f4cb6a5253b464c124ff6a13c7859e67e8de472abfb
7
+ data.tar.gz: 986e94aa57f5ebe8c9470491079c0452c0e06aaa2886d3e767c0191ac3e5c51f07d6b19b462c48b7b20c1a8a5717a4f8e87b8a57ae0fe037a345f23bdf0d9ae2
@@ -2,6 +2,34 @@
2
2
  # require 'joyceshop' # for production
3
3
  require_relative '../lib/joyceshop.rb' # for testing
4
4
 
5
- scraper = JoyceShop::Scraper.new
6
- puts scraper.search('紗針織衫', {price_boundary: [100, 443]})
7
- puts scraper.scrape(:tops, 1)
5
+ @scraper = JoyceShop::Scraper.new
6
+
7
+ # command type keyword lprice hprice page_limit
8
+ def parse_args argv
9
+ input_length = argv.length
10
+ abort 'invalid usage' unless input_length <= 5
11
+
12
+ if input_length == 0 # scrape main category
13
+ @scraper.scrape('latest')
14
+ elsif input_length == 1 # scrape main category
15
+ @scraper.scrape(argv[0])
16
+ elsif input_length == 2
17
+ t = argv[1].to_i
18
+ if t != 0
19
+ options = { page_limit: argv[1] }
20
+ else
21
+ options = { keyword: argv[1] }
22
+ end
23
+ @scraper.scrape(argv[0], options)
24
+ elsif input_length == 3
25
+ options = { keyword: argv[1], page_limit: argv[2] }
26
+ @scraper.scrape(argv[0], options)
27
+ elsif input_length == 5
28
+ options = { keyword: argv[2], page_limit: argv[5],
29
+ price_boundary: [argv[3], argv[4]]
30
+ }
31
+ @scraper.scrape_filter(argv[0], options)
32
+ end
33
+ end
34
+
35
+ puts parse_args ARGV
@@ -5,104 +5,84 @@ require 'open-uri'
5
5
 
6
6
  # scrape data
7
7
  module JoyceShop
8
+ # extract_data class uses xpath selectors to get attribs
8
9
  class Scraper
9
- # Types
10
- @@VALID_TYPES = [:tops, :popular, :pants, :pants, :accessories, :latest]
11
-
12
- # URI
13
- @@BASE_URI = 'https://www.joyce-shop.com'
14
- @@LATEST_URI = "#{@@BASE_URI}/PDList.asp?brand=01&item1=&item2=&ya19=&keyword=&recommand=1412170001&ob=F"
15
- @@POPULAR_URI = "#{@@BASE_URI}/PDList.asp?brand=01&item1=&item2=&ya19=&keyword=&recommand=1305080002&ob=F"
16
- @@TOPS_URI = "#{@@BASE_URI}/PDList.asp?brand=01&item1=110&item2=111&ya19=&keyword=&recommand=&ob=F"
17
- @@PANTS_URI = "#{@@BASE_URI}/PDList.asp?brand=01&item1=120&item2=121&ya19=&keyword=&recommand=&ob=F"
18
- @@ACCESSORIES_URI = "#{@@BASE_URI}/PDList.asp?brand=01&item1=140&item2=141&ya19=&keyword=&recommand=&ob=F"
19
- @@SEARCH_URI = "#{@@BASE_URI}/PDList.asp?"
20
-
21
- # Selectors
22
- @@ITEM_SELECTOR = "//div[contains(@class, 'NEW_shop_list')]/ul/li/div[contains(@class, 'NEW_shop_list_pic')]"
23
- @@LINK_SELECTOR = 'a[1]/@href'
24
- @@IMAGE_SELECTOR = "a/img[contains(@class, 'lazyload')]/@src"
25
- @@ITEM_INFO_SELECTOR = "div[contains(@class, 'NEW_shop_list_info')]"
26
- @@TITLE_SELECTOR = "#{@@ITEM_INFO_SELECTOR}/div[1]"
27
- @@PRICE_SELECTOR = "#{@@ITEM_INFO_SELECTOR}/span"
10
+ BASE_URL = 'https://www.joyce-shop.com'
11
+ BASE_SCRAPE_URL = "#{BASE_URL}/PDList.asp?"
12
+
13
+ LATEST_URI = "#{BASE_SCRAPE_URL}brand=01&item1=&item2=&ya19=&keyword=&recommand=1412170001&ob=F"
14
+ POPULAR_URI = "#{BASE_SCRAPE_URL}brand=01&item1=&item2=&ya19=&keyword=&recommand=1305080002&ob=F"
15
+ TOPS_URI = "#{BASE_SCRAPE_URL}brand=01&item1=110&item2=111&ya19=&keyword=&recommand=&ob=F"
16
+ PANTS_URI = "#{BASE_SCRAPE_URL}brand=01&item1=120&item2=121&ya19=&keyword=&recommand=&ob=F"
17
+ ACCESSORIES_URI = "#{BASE_SCRAPE_URL}brand=01&item1=140&item2=141&ya19=&keyword=&recommand=&ob=F"
18
+
19
+ # xml selectors that will be used to scrape data
20
+ ITEM_SELECTOR = "//div[contains(@class, 'NEW_shop_list')]/ul/li/div[contains(@class, 'NEW_shop_list_pic')]"
21
+ ITEM_INFO_SELECTOR = "div[contains(@class, 'NEW_shop_list_info')]"
22
+ TITLE_SELECTOR = "#{ITEM_INFO_SELECTOR}/div[1]"
23
+ IMAGE_SELECTOR = "a/img[contains(@class, 'lazyload')]/@src"
24
+ PRICE_SELECTOR = "#{ITEM_INFO_SELECTOR}/span"
25
+ LINK_SELECTOR = "a[1]/@href"
28
26
 
29
27
  # Regular
30
- @@TITLE_REGEX = /([.\p{Han}[a-zA-Z]]+)/
28
+ TITLE_REGEX = /([.\p{Han}[a-zA-Z]]+)/
31
29
 
32
- def latest(page, options={})
33
- uri = uri_with_page(@@LATEST_URI, page)
34
- body = fetch_data(uri)
35
- data = parse_html(body)
36
- filter(data, options)
30
+ def latest(page, options = {})
31
+ uri = uri_with_options(build_uri(LATEST_URI, options), page)
32
+ process_request(uri, options)
37
33
  end
38
34
 
39
- def popular(page, options={})
40
- uri = uri_with_page(@@POPULAR_URI, page)
41
- body = fetch_data(uri)
42
- data = parse_html(body)
43
- filter(data, options)
35
+ def popular(page, options = {})
36
+ uri = uri_with_options(build_uri(POPULAR_URI, options), page)
37
+ process_request(uri, options)
44
38
  end
45
39
 
46
- def tops(page, options={})
47
- uri = uri_with_page(@@TOPS_URI, page)
48
- body = fetch_data(uri)
49
- data = parse_html(body)
50
- filter(data, options)
40
+ def tops(page, options = {})
41
+ uri = uri_with_options(build_uri(TOPS_URI, options), page)
42
+ process_request(uri, options)
51
43
  end
52
44
 
53
- def pants(page, options={})
54
- uri = uri_with_page(@@PANTS_URI, page)
55
- body = fetch_data(uri)
56
- data = parse_html(body)
57
- filter(data, options)
45
+ def pants(page, options = {})
46
+ uri = uri_with_options(build_uri(PANTS_URI, options), page)
47
+ process_request(uri, options)
58
48
  end
59
49
 
60
- def accessories(page, options={})
61
- uri = uri_with_page(@@ACCESSORIES_URI, page)
62
- body = fetch_data(uri)
63
- data = parse_html(body)
64
- filter(data, options)
50
+ def accessories(page, options = {})
51
+ uri = uri_with_options(build_uri(ACCESSORIES_URI, options), page)
52
+ process_request(uri, options)
65
53
  end
66
54
 
67
- def search(keyword, options={})
68
- uri = uri_with_search(keyword)
69
- body = fetch_data(uri)
70
- data = parse_html(body)
71
- filter(data, options)
55
+ def search(page, options = {})
56
+ uri = uri_with_options(build_uri(BASE_SCRAPE_URL, options), page)
57
+ process_request(uri, options)
72
58
  end
73
59
 
74
- def scrape(type, page, options = {})
75
- abort "only supports #{@@VALID_TYPES}" unless @@VALID_TYPES.include?(type.to_sym)
76
-
77
- method = self.method(type)
78
- method.call(page, options)
60
+ def scrape(type, options = {})
61
+ records = []
62
+ valid_args = [:tops, :popular, :pants, :pants,
63
+ :accessories, :latest, :search]
64
+ abort 'invalid parameter - scrape type' unless valid_args.include?(type.to_sym)
65
+ scrape_what(type, options)
79
66
  end
80
67
 
81
68
  private
82
- def uri_with_page(uri, page)
83
- "#{uri}&pageno=#{page}"
84
- end
85
-
86
- def uri_with_search(keyword)
87
- "#{@@SEARCH_URI}keyword=#{URI.escape(keyword)}"
88
- end
89
69
 
90
- def fetch_data(uri)
91
- open(uri) { |file| file.read }
70
+ def process_request(uri, options)
71
+ body = open_uri(uri)
72
+ data = extract_data(body)
73
+ filter(data, options)
92
74
  end
93
75
 
94
- # Filter
95
- # ------------------------------------------------------------
76
+ # filter by price if the options are not empty
96
77
  def filter(data, options)
97
78
  results = data
98
-
99
79
  unless options.empty?
100
80
  results = match_price(results, options[:price_boundary]) if options[:price_boundary]
101
81
  end
102
-
103
82
  results
104
83
  end
105
84
 
85
+ # do the actual extraction of prices from the result set
106
86
  def match_price(data, boundary)
107
87
  lower_bound = boundary.first || 0
108
88
  upper_bound = boundary.last || Float::INFINITY
@@ -110,14 +90,39 @@ module JoyceShop
110
90
  data.select { |item| lower_bound <= item[:price] && item[:price] <= upper_bound }
111
91
  end
112
92
 
113
- # Parser
114
- # ------------------------------------------------------------
115
- def parse_html(raw)
93
+ def build_uri(uri, options = {})
94
+ opts = { uri: uri }
95
+ unless options.empty?
96
+ opts[:keyword] = options[:keyword] if options[:keyword]
97
+ end
98
+ opts
99
+ end
100
+
101
+ def uri_with_options(options = {}, page)
102
+ uri = ''
103
+ unless options.empty?
104
+ keyword = options[:keyword] || nil
105
+ uri << "#{options[:uri]}&pageno=#{page}" if options[:uri]
106
+ uri << "br=X&keyword=#{URI.escape(keyword)}" if options[:keyword]
107
+ end
108
+ uri
109
+ end
110
+
111
+ # try open the URL, fail on error
112
+ def open_uri(uri)
113
+ open(uri) {|file| file.read}
114
+ rescue StandardError
115
+ 'error opening site url'
116
+ end
117
+
118
+ # iterate over every element of item using xpath
119
+ def extract_data(raw)
116
120
  Oga.parse_html(raw)
117
- .xpath(@@ITEM_SELECTOR)
121
+ .xpath(ITEM_SELECTOR)
118
122
  .map { |item| parse(item) }
119
123
  end
120
124
 
125
+ # call methods to extract the data using xpath
121
126
  def parse(item)
122
127
  {
123
128
  title: extract_title(item),
@@ -127,24 +132,41 @@ module JoyceShop
127
132
  }
128
133
  end
129
134
 
135
+ # Iconv is neccessary here otherwise text is unreadable
130
136
  def extract_title(item)
131
- item.xpath(@@TITLE_SELECTOR).text
132
- .scan(@@TITLE_REGEX)
137
+ item.xpath(TITLE_SELECTOR).text
138
+ .scan(TITLE_REGEX)
133
139
  .flatten[0]
134
140
  end
135
141
 
142
+ # get rid of the NT and convert to integer
136
143
  def extract_price(item)
137
- item.xpath(@@PRICE_SELECTOR).text.to_i
144
+ item.xpath(PRICE_SELECTOR).text.sub(/NT. /, '').to_i
138
145
  end
139
146
 
147
+ # extract two images and return array or urls
140
148
  def extract_images(item)
141
- image = item.xpath(@@IMAGE_SELECTOR).text
149
+ image = item.xpath(IMAGE_SELECTOR).text
142
150
  image_hover = image.sub(/\.jpg/, '-h.jpg')
143
- ["#{@@BASE_URI}#{image}", "#{@@BASE_URI}#{image_hover}"]
151
+ image_hover = image.sub(/\.png/, '-h.png') unless image_hover != image
152
+ ["#{BASE_URL}#{image}", "#{BASE_URL}#{image_hover}"]
144
153
  end
145
154
 
155
+ # get the link to the item
146
156
  def extract_link(item)
147
- "#{@@BASE_URI}/#{item.xpath(@@LINK_SELECTOR).text}"
157
+ "#{BASE_URL}/#{item.xpath(LINK_SELECTOR).text}"
158
+ end
159
+
160
+ def scrape_what(type, options)
161
+ records = []
162
+ pl = options[:page_limit].to_i
163
+ page_limit = pl != 0 ? pl : 5
164
+
165
+ 1.upto(page_limit) do |page|
166
+ method = self.method(type)
167
+ records.push(method.call(page, options))
168
+ end
169
+ records.reject { |c| c.empty? }.flatten(1).uniq
148
170
  end
149
171
  end
150
172
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: joyceshop
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Even Chang
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2016-01-04 00:00:00.000000000 Z
14
+ date: 2016-01-10 00:00:00.000000000 Z
15
15
  dependencies: []
16
16
  description: This is a gem scraping joyceshop's website and returns the popular/latest
17
17
  items