tomosia_amanaplus_crawl 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +0 -1
- data/README.md +10 -1
- data/lib/tomosia_amanaplus_crawl.rb +16 -3
- data/lib/tomosia_amanaplus_crawl/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2832ce228e9bc2ebcce58c1dde236cc0f6bfa4c789654e976cb66defe368fb39
|
4
|
+
data.tar.gz: 790070002318d2d3c3727b9fd3986b655624579d3130e32e0ba6c88bd65485cd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8b174ec110e8933aae815abef670ae0a33ae69077b98268522aaf83a52702f7d3c303cc9bbf73d5e8ec7d27a9cb2deb1cd1d05e6edfc9b5ecff8fa94b49af4d9
|
7
|
+
data.tar.gz: badfef5587e0b7989a313caf4490f59efba61d9e743f16126ae1bd7915abd7d878be1467bbcef967c4968c590c7c4383f542c0a8a51f5bf07783759ae4b216da
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -10,6 +10,9 @@ Add this line to your application's Gemfile:
|
|
10
10
|
|
11
11
|
```ruby
|
12
12
|
gem 'tomosia_amanaplus_crawl'
|
13
|
+
gem 'httparty'
|
14
|
+
gem 'nokogiri'
|
15
|
+
gem 'spreadsheet'
|
13
16
|
```
|
14
17
|
|
15
18
|
And then execute:
|
@@ -22,7 +25,13 @@ Or install it yourself as:
|
|
22
25
|
|
23
26
|
## Usage
|
24
27
|
|
25
|
-
|
28
|
+
```ruby
|
29
|
+
require 'tomosia_amanaplus_crawl'
|
30
|
+
TomosiaAmanaplusCrawl::Crawler.new.run(keyword, path, max)
|
31
|
+
```
|
32
|
+
keyword: hoian, danang, ...
|
33
|
+
path: './', '/desktop/', ...
|
34
|
+
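max: số lượng ảnh muốn lấy về. Nếu max lớn hơn tổng số ảnh các page thì vẫn lấy hết tất cả ảnh (the number of images to fetch; if max is greater than the total number of images across all pages, every image is still fetched)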
|
26
35
|
|
27
36
|
## Development
|
28
37
|
|
@@ -10,21 +10,29 @@ module TomosiaAmanaplusCrawl
|
|
10
10
|
class Crawler
|
11
11
|
URL = "https://plus.amanaimages.com/items/search/"
|
12
12
|
|
13
|
-
def run(keyword, destination)
|
13
|
+
def run(keyword, destination, max)
|
14
14
|
unparsed_page = HTTParty.get("#{URL}/#{keyword}")
|
15
15
|
parsed_page = Nokogiri::HTML(unparsed_page)
|
16
16
|
|
17
17
|
pages = parsed_page.css("div.c-paginate__nums").css('a').last.text.to_i # tổng số page
|
18
18
|
images_listings = parsed_page.css("div.p-search-result__body") # danh sách các thẻ div chứa image
|
19
|
+
|
20
|
+
# lấy tổng số image
|
21
|
+
total = parsed_page.css("h1.p-search-result__ttl").text.split(' ').first
|
22
|
+
total = total[11..(total.length - 1)].chop.chop.chop.sub(',', '').to_i
|
23
|
+
if max > total # nếu max lớn hơn total thì max = total => vẫn lấy hết
|
24
|
+
max = total
|
25
|
+
end
|
19
26
|
|
20
|
-
images = getPaginationImages(images_listings, pages, keyword)
|
27
|
+
images = getPaginationImages(images_listings, pages, keyword, max)
|
21
28
|
downloadImages(images, destination)
|
22
29
|
writeToExcel(images, destination)
|
23
30
|
end
|
24
31
|
|
25
|
-
def getPaginationImages(images_listings, pages, keyword) # lấy tất cả image của các page cộng lại
|
32
|
+
def getPaginationImages(images_listings, pages, keyword, max) # lấy tất cả image của các page cộng lại
|
26
33
|
images = Array.new
|
27
34
|
curr_page = 1
|
35
|
+
curr_index = 1
|
28
36
|
while curr_page <= pages
|
29
37
|
puts "Crawling page #{curr_page}..........."
|
30
38
|
|
@@ -33,6 +41,10 @@ module TomosiaAmanaplusCrawl
|
|
33
41
|
pagination_images_listings = pagination_parsed_page.css("div.p-item-thumb")
|
34
42
|
|
35
43
|
pagination_images_listings.each do |img|
|
44
|
+
if curr_index > max
|
45
|
+
return images
|
46
|
+
end
|
47
|
+
|
36
48
|
src = img.css('img').attr('data-src').nil? == true ? img.css('img').attr('src') : img.css('img').attr('data-src')
|
37
49
|
current_image = {
|
38
50
|
title: img.css('a')[1].attr('title'),
|
@@ -41,6 +53,7 @@ module TomosiaAmanaplusCrawl
|
|
41
53
|
extension: ".#{src.to_s.split('.').last}"
|
42
54
|
}
|
43
55
|
images << current_image
|
56
|
+
curr_index += 1
|
44
57
|
end
|
45
58
|
|
46
59
|
curr_page += 1
|