joyceshop 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/joyceshop +6 -0
- data/lib/joyceshop.rb +2 -0
- data/lib/joyceshop/scraper.rb +101 -0
- metadata +55 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a3acee65aadff3a04affd99a74cdf5504632c695
|
4
|
+
data.tar.gz: 2709281e4846fe17d1a70649554aa87ddbea2b81
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 2f512e054122cfe33a784207d417de2539687646cf23127dbefe5719270c78c6e7f8740c7103c4242a1ab506b1af2f934a733389c48ba05624243e3cfc3d0d5f
|
7
|
+
data.tar.gz: af79d4953a8ad049e69f7074ca7bbc575ecc5e3df6518a9c6b53b08b0006f6b67ab46a94a625de4be3cc608f47999e7dc00bc9d2ec6f74912dd44dc249953632
|
data/bin/joyceshop
ADDED
data/lib/joyceshop.rb
ADDED
data/lib/joyceshop/scraper.rb
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'oga'
|
3
|
+
require 'open-uri'
|
4
|
+
|
5
|
+
# scrape data
|
6
|
+
# Namespace for the Joyce Shop scraping gem.
module JoyceShop
  # Downloads a product-listing page from the Joyce Shop storefront and
  # converts every product tile found on it into a Hash with the keys
  # +:title+, +:price+, +:images+ and +:link+.
  #
  # Each public method takes the listing page number, which is appended to the
  # request as the +pageno+ query parameter, and returns an Array of such
  # Hashes (empty when the item selector matches nothing).
  class Scraper
    # URI
    BASE_URI = 'https://www.joyce-shop.com'.freeze
    LATEST_URI = "#{BASE_URI}/PDList.asp?brand=01&item1=&item2=&ya19=&keyword=&recommand=1412170001&ob=F".freeze
    POPULAR_URI = "#{BASE_URI}/PDList.asp?brand=01&item1=&item2=&ya19=&keyword=&recommand=1305080002&ob=F".freeze
    TOPS_URI = "#{BASE_URI}/PDList.asp?brand=01&item1=110&item2=111&ya19=&keyword=&recommand=&ob=F".freeze
    PANTS_URI = "#{BASE_URI}/PDList.asp?brand=01&item1=120&item2=121&ya19=&keyword=&recommand=&ob=F".freeze
    ACCESSORIES_URI = "#{BASE_URI}/PDList.asp?brand=01&item1=140&item2=141&ya19=&keyword=&recommand=&ob=F".freeze

    # Selectors (XPath). ITEM_SELECTOR is evaluated against the whole document;
    # the remaining ones are evaluated relative to a single matched item.
    ITEM_SELECTOR = "//div[contains(@class, 'NEW_shop_list')]/ul/li/div[contains(@class, 'NEW_shop_list_pic')]".freeze
    LINK_SELECTOR = 'a'.freeze
    IMAGE_SELECTOR = "a/img[contains(@class, 'lazyload')]".freeze
    ITEM_INFO_SELECTOR = "div[contains(@class, 'NEW_shop_list_info')]".freeze
    TITLE_SELECTOR = "#{ITEM_INFO_SELECTOR}/div[1]".freeze
    PRICE_SELECTOR = "#{ITEM_INFO_SELECTOR}/span".freeze

    # Regular expressions. A title is the first run of dots, Han characters
    # and ASCII letters found in the title node's text.
    TITLE_REGEX = /([.\p{Han}[a-zA-Z]]+)/

    # @param page [Integer, String] listing page number
    # @return [Array<Hash>] items from the "latest" listing
    def latest(page)
      scrape(LATEST_URI, page)
    end

    # @param page [Integer, String] listing page number
    # @return [Array<Hash>] items from the "popular" listing
    def popular(page)
      scrape(POPULAR_URI, page)
    end

    # @param page [Integer, String] listing page number
    # @return [Array<Hash>] items from the "tops" listing
    def tops(page)
      scrape(TOPS_URI, page)
    end

    # @param page [Integer, String] listing page number
    # @return [Array<Hash>] items from the "pants" listing
    def pants(page)
      scrape(PANTS_URI, page)
    end

    # @param page [Integer, String] listing page number
    # @return [Array<Hash>] items from the "accessories" listing
    def accessories(page)
      scrape(ACCESSORIES_URI, page)
    end

    private

    # Shared pipeline behind every public listing method:
    # build the paged URI, download it, extract the items.
    def scrape(listing_uri, page)
      filter(fetch_data(uri_with_page(listing_uri, page)))
    end

    # Appends the page number as the +pageno+ query parameter.
    def uri_with_page(uri, page)
      "#{uri}&pageno=#{page}"
    end

    # Downloads +uri+ and returns the response body as a String.
    #
    # The string is parsed into a URI object and opened through open-uri's
    # URI-level +open+ rather than Kernel#open: Kernel#open no longer opens
    # URLs on Ruby 3.0+, and on older Rubies it would execute a command for
    # input starting with "|". Input that is not a valid HTTP(S) URI raises
    # (URI::InvalidURIError or NoMethodError) instead of touching the local
    # system.
    def fetch_data(uri)
      URI.parse(uri).open(&:read)
    end

    # Parses the raw HTML and maps every node matching ITEM_SELECTOR to an
    # item Hash.
    def filter(raw)
      Oga.parse_html(raw).xpath(ITEM_SELECTOR).map { |item| parse(item) }
    end

    # Builds the Hash describing a single product tile.
    def parse(item)
      {
        title: extract_title(item),
        price: extract_price(item),
        images: extract_images(item),
        link: extract_link(item)
      }
    end

    # Returns the first TITLE_REGEX match in the title node's text, or nil
    # when the text contains no matching characters.
    def extract_title(item)
      item.xpath(TITLE_SELECTOR).text[TITLE_REGEX, 1]
    end

    # Returns the price node's text converted with String#to_i, so text that
    # does not start with digits yields 0.
    def extract_price(item)
      item.xpath(PRICE_SELECTOR).text.to_i
    end

    # Returns two absolute image URLs: the image's +src+ and a second URL with
    # "-h" inserted before the first ".jpg" (presumably the hover image).
    # Raises NoMethodError when the item has no matching image.
    def extract_images(item)
      image = item.xpath(IMAGE_SELECTOR).attribute(:src).first.value
      image_hover = image.sub(/\.jpg/, '-h.jpg')
      ["#{BASE_URI}#{image}", "#{BASE_URI}#{image_hover}"]
    end

    # Returns the absolute product URL built from the first link's +href+.
    # A "/" is inserted after the base URI, so the href is presumably relative
    # without a leading slash (verify against the live markup).
    def extract_link(item)
      "#{BASE_URI}/#{item.xpath(LINK_SELECTOR).attribute(:href).first.value}"
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: joyceshop
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Even Chang
|
8
|
+
- Luis Herrera
|
9
|
+
- Katy Lee
|
10
|
+
- Frank Lee
|
11
|
+
autorequire:
|
12
|
+
bindir: bin
|
13
|
+
cert_chain: []
|
14
|
+
date: 2015-12-14 00:00:00.000000000 Z
|
15
|
+
dependencies: []
|
16
|
+
description: This is a gem scraping joyceshop's website and returns the popular/latest
|
17
|
+
items
|
18
|
+
email:
|
19
|
+
- kiki44552002@gmail.com
|
20
|
+
- lmherrera86@gmail.com
|
21
|
+
- katylee41024@yahoo.com.tw
|
22
|
+
- frank1234211@gmail.com
|
23
|
+
executables:
|
24
|
+
- joyceshop
|
25
|
+
extensions: []
|
26
|
+
extra_rdoc_files: []
|
27
|
+
files:
|
28
|
+
- bin/joyceshop
|
29
|
+
- lib/joyceshop.rb
|
30
|
+
- lib/joyceshop/scraper.rb
|
31
|
+
homepage: http://rubygems.org/gems/joyceshop
|
32
|
+
licenses:
|
33
|
+
- MIT
|
34
|
+
metadata: {}
|
35
|
+
post_install_message:
|
36
|
+
rdoc_options: []
|
37
|
+
require_paths:
|
38
|
+
- lib
|
39
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - ">="
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '0'
|
49
|
+
requirements: []
|
50
|
+
rubyforge_project:
|
51
|
+
rubygems_version: 2.4.7
|
52
|
+
signing_key:
|
53
|
+
specification_version: 4
|
54
|
+
summary: Scraper for JoyceShop
|
55
|
+
test_files: []
|