joyceshop 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/joyceshop +6 -0
- data/lib/joyceshop.rb +2 -0
- data/lib/joyceshop/scraper.rb +101 -0
- metadata +55 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a3acee65aadff3a04affd99a74cdf5504632c695
|
4
|
+
data.tar.gz: 2709281e4846fe17d1a70649554aa87ddbea2b81
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 2f512e054122cfe33a784207d417de2539687646cf23127dbefe5719270c78c6e7f8740c7103c4242a1ab506b1af2f934a733389c48ba05624243e3cfc3d0d5f
|
7
|
+
data.tar.gz: af79d4953a8ad049e69f7074ca7bbc575ecc5e3df6518a9c6b53b08b0006f6b67ab46a94a625de4be3cc608f47999e7dc00bc9d2ec6f74912dd44dc249953632
|
data/bin/joyceshop
ADDED
data/lib/joyceshop.rb
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'oga'
|
3
|
+
require 'open-uri'
|
4
|
+
|
5
|
+
# scrape data
|
6
|
+
module JoyceShop
  # Scrapes product listings from the Joyce Shop storefront.
  #
  # Every public method takes a page number, downloads the matching listing
  # page and returns an Array of Hashes, one per product found, with the keys
  # +:title+, +:price+, +:images+ and +:link+.
  class Scraper
    # URIs. Frozen constants are used instead of class variables so the
    # values cannot be reassigned or shared mutably with subclasses.
    BASE_URI = 'https://www.joyce-shop.com'.freeze
    LATEST_URI = "#{BASE_URI}/PDList.asp?brand=01&item1=&item2=&ya19=&keyword=&recommand=1412170001&ob=F".freeze
    POPULAR_URI = "#{BASE_URI}/PDList.asp?brand=01&item1=&item2=&ya19=&keyword=&recommand=1305080002&ob=F".freeze
    TOPS_URI = "#{BASE_URI}/PDList.asp?brand=01&item1=110&item2=111&ya19=&keyword=&recommand=&ob=F".freeze
    PANTS_URI = "#{BASE_URI}/PDList.asp?brand=01&item1=120&item2=121&ya19=&keyword=&recommand=&ob=F".freeze
    ACCESSORIES_URI = "#{BASE_URI}/PDList.asp?brand=01&item1=140&item2=141&ya19=&keyword=&recommand=&ob=F".freeze

    # XPath selectors. ITEM_SELECTOR is evaluated against the whole document;
    # the others are evaluated relative to a single matched item node.
    ITEM_SELECTOR = "//div[contains(@class, 'NEW_shop_list')]/ul/li/div[contains(@class, 'NEW_shop_list_pic')]".freeze
    LINK_SELECTOR = 'a'.freeze
    IMAGE_SELECTOR = "a/img[contains(@class, 'lazyload')]".freeze
    ITEM_INFO_SELECTOR = "div[contains(@class, 'NEW_shop_list_info')]".freeze
    TITLE_SELECTOR = "#{ITEM_INFO_SELECTOR}/div[1]".freeze
    PRICE_SELECTOR = "#{ITEM_INFO_SELECTOR}/span".freeze

    # Regular expressions. A title is the first run of dots, Han characters
    # and ASCII letters found in the title node's text.
    TITLE_REGEX = /([.\p{Han}[a-zA-Z]]+)/

    # @param page [Integer, String] page number appended as +pageno+
    # @return [Array<Hash>] items from the "latest" listing
    def latest(page)
      scrape(LATEST_URI, page)
    end

    # @param page [Integer, String] page number appended as +pageno+
    # @return [Array<Hash>] items from the "popular" listing
    def popular(page)
      scrape(POPULAR_URI, page)
    end

    # @param page [Integer, String] page number appended as +pageno+
    # @return [Array<Hash>] items from the "tops" listing
    def tops(page)
      scrape(TOPS_URI, page)
    end

    # @param page [Integer, String] page number appended as +pageno+
    # @return [Array<Hash>] items from the "pants" listing
    def pants(page)
      scrape(PANTS_URI, page)
    end

    # @param page [Integer, String] page number appended as +pageno+
    # @return [Array<Hash>] items from the "accessories" listing
    def accessories(page)
      scrape(ACCESSORIES_URI, page)
    end

    private

    # Shared pipeline behind every public listing method:
    # build the paged URI, download it, then extract the items.
    def scrape(listing_uri, page)
      filter(fetch_data(uri_with_page(listing_uri, page)))
    end

    # Appends the +pageno+ query parameter to a listing URI.
    def uri_with_page(uri, page)
      "#{uri}&pageno=#{page}"
    end

    # Downloads +uri+ and returns the response body.
    #
    # Goes through URI#open (added to HTTP(S) URIs by open-uri) instead of
    # Kernel#open: Kernel#open no longer opens URLs as of Ruby 3.0 and would
    # also run a string starting with "|" as a shell command.
    def fetch_data(uri)
      URI.parse(uri).open { |response| response.read }
    end

    # Parses the raw HTML and maps every node matching ITEM_SELECTOR to a
    # Hash describing the product.
    def filter(raw)
      Oga.parse_html(raw)
         .xpath(ITEM_SELECTOR)
         .map { |item| parse(item) }
    end

    # Builds the result Hash for one item node.
    def parse(item)
      {
        title: extract_title(item),
        price: extract_price(item),
        images: extract_images(item),
        link: extract_link(item)
      }
    end

    # @return [String, nil] first TITLE_REGEX match in the title text,
    #   or nil when nothing matches
    def extract_title(item)
      item.xpath(TITLE_SELECTOR).text[TITLE_REGEX, 1]
    end

    # @return [Integer] price text converted with String#to_i, so text that
    #   does not start with digits yields 0
    def extract_price(item)
      item.xpath(PRICE_SELECTOR).text.to_i
    end

    # @return [Array<String>] absolute URLs of the image and of its hover
    #   variant (first ".jpg" replaced by "-h.jpg"); empty when the item has
    #   no matching image with a +src+ attribute
    def extract_images(item)
      image = first_attribute_value(item, IMAGE_SELECTOR, :src)
      return [] if image.nil?

      image_hover = image.sub(/\.jpg/, '-h.jpg')
      ["#{BASE_URI}#{image}", "#{BASE_URI}#{image_hover}"]
    end

    # @return [String, nil] absolute product URL, or nil when the item has no
    #   anchor with an +href+ attribute
    def extract_link(item)
      href = first_attribute_value(item, LINK_SELECTOR, :href)
      href.nil? ? nil : "#{BASE_URI}/#{href}"
    end

    # Returns the value of the first +name+ attribute found on the nodes
    # matching +selector+ under +item+, or nil when there is none. Guards
    # against calling +value+ on nil, which would abort the whole page.
    def first_attribute_value(item, selector, name)
      attribute = item.xpath(selector).attribute(name).compact.first
      attribute.nil? ? nil : attribute.value
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: joyceshop
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Even Chang
|
8
|
+
- Luis Herrera
|
9
|
+
- Katy Lee
|
10
|
+
- Frank Lee
|
11
|
+
autorequire:
|
12
|
+
bindir: bin
|
13
|
+
cert_chain: []
|
14
|
+
date: 2015-12-14 00:00:00.000000000 Z
|
15
|
+
dependencies: []
|
16
|
+
description: This is a gem that scrapes joyceshop's website and returns the popular/latest
|
17
|
+
items
|
18
|
+
email:
|
19
|
+
- kiki44552002@gmail.com
|
20
|
+
- lmherrera86@gmail.com
|
21
|
+
- katylee41024@yahoo.com.tw
|
22
|
+
- frank1234211@gmail.com
|
23
|
+
executables:
|
24
|
+
- joyceshop
|
25
|
+
extensions: []
|
26
|
+
extra_rdoc_files: []
|
27
|
+
files:
|
28
|
+
- bin/joyceshop
|
29
|
+
- lib/joyceshop.rb
|
30
|
+
- lib/joyceshop/scraper.rb
|
31
|
+
homepage: http://rubygems.org/gems/joyceshop
|
32
|
+
licenses:
|
33
|
+
- MIT
|
34
|
+
metadata: {}
|
35
|
+
post_install_message:
|
36
|
+
rdoc_options: []
|
37
|
+
require_paths:
|
38
|
+
- lib
|
39
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - ">="
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '0'
|
49
|
+
requirements: []
|
50
|
+
rubyforge_project:
|
51
|
+
rubygems_version: 2.4.7
|
52
|
+
signing_key:
|
53
|
+
specification_version: 4
|
54
|
+
summary: Scraper for JoyceShop
|
55
|
+
test_files: []
|