tomosia_amanaplus_crawl 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1daa69b29114aa1cd35172183b838b1dec522be2cc796449f6cfcb93c279a7de
4
- data.tar.gz: 1f656912c20d6e924ac40a0bc87f60b5e3c01762ccc2d2f4c72c202e23acfad0
3
+ metadata.gz: 2832ce228e9bc2ebcce58c1dde236cc0f6bfa4c789654e976cb66defe368fb39
4
+ data.tar.gz: 790070002318d2d3c3727b9fd3986b655624579d3130e32e0ba6c88bd65485cd
5
5
  SHA512:
6
- metadata.gz: 86f3eeed3acbe50e039ac50b1359f9b8894354d966dcf84321465c1584789bb77444acc40d608380e53b7873c265699f5e5501f44883b0178ab95baf61b87ee6
7
- data.tar.gz: 5556e3ea60053e307dc1ef5f08412220046cf07fe63ad018d99d4f82a9e756bc66a9539e3c0c190ac0de8f6a3e6cdeb94b783bdb3f2c4730300bac5e70fb9973
6
+ metadata.gz: 8b174ec110e8933aae815abef670ae0a33ae69077b98268522aaf83a52702f7d3c303cc9bbf73d5e8ec7d27a9cb2deb1cd1d05e6edfc9b5ecff8fa94b49af4d9
7
+ data.tar.gz: badfef5587e0b7989a313caf4490f59efba61d9e743f16126ae1bd7915abd7d878be1467bbcef967c4968c590c7c4383f542c0a8a51f5bf07783759ae4b216da
data/Gemfile CHANGED
@@ -9,4 +9,3 @@ gem "rspec", "~> 3.0"
9
9
  gem 'httparty'
10
10
  gem 'nokogiri'
11
11
  gem 'spreadsheet'
12
- gem 'byebug'
data/README.md CHANGED
@@ -10,6 +10,9 @@ Add this line to your application's Gemfile:
10
10
 
11
11
  ```ruby
12
12
  gem 'tomosia_amanaplus_crawl'
13
+ gem 'httparty'
14
+ gem 'nokogiri'
15
+ gem 'spreadsheet'
13
16
  ```
14
17
 
15
18
  And then execute:
@@ -22,7 +25,13 @@ Or install it yourself as:
22
25
 
23
26
  ## Usage
24
27
 
25
- TODO: Write usage instructions here
28
+ ```ruby
29
+ require 'tomosia_amanaplus_crawl'
30
+ TomosiaAmanaplusCrawl::Crawler.new.run(keyword, path, max)
31
+ ```
32
+ keyword: the search term, e.g. hoian, danang, ...
33
+ path: the destination directory, e.g. './', '/desktop/', ...
34
+ max: the maximum number of images to download. If max is greater than the total number of images across all pages, all images are still downloaded.
26
35
 
27
36
  ## Development
28
37
 
@@ -10,21 +10,29 @@ module TomosiaAmanaplusCrawl
10
10
  class Crawler
11
11
  URL = "https://plus.amanaimages.com/items/search/"
12
12
 
13
- def run(keyword, destination)
13
+ def run(keyword, destination, max)
14
14
  unparsed_page = HTTParty.get("#{URL}/#{keyword}")
15
15
  parsed_page = Nokogiri::HTML(unparsed_page)
16
16
 
17
17
  pages = parsed_page.css("div.c-paginate__nums").css('a').last.text.to_i # tổng số page
18
18
  images_listings = parsed_page.css("div.p-search-result__body") # danh sách các thẻ div chứa image
19
+
20
+ # lấy tổng số image
21
+ total = parsed_page.css("h1.p-search-result__ttl").text.split(' ').first
22
+ total = total[11..(total.length - 1)].chop.chop.chop.sub(',', '').to_i
23
+ if max > total # nếu max lớn hơn total thì max = total => vẫn lấy hết
24
+ max = total
25
+ end
19
26
 
20
- images = getPaginationImages(images_listings, pages, keyword)
27
+ images = getPaginationImages(images_listings, pages, keyword, max)
21
28
  downloadImages(images, destination)
22
29
  writeToExcel(images, destination)
23
30
  end
24
31
 
25
- def getPaginationImages(images_listings, pages, keyword) # lấy tất cả image của các page cộng lại
32
+ def getPaginationImages(images_listings, pages, keyword, max) # lấy tất cả image của các page cộng lại
26
33
  images = Array.new
27
34
  curr_page = 1
35
+ curr_index = 1
28
36
  while curr_page <= pages
29
37
  puts "Crawling page #{curr_page}..........."
30
38
 
@@ -33,6 +41,10 @@ module TomosiaAmanaplusCrawl
33
41
  pagination_images_listings = pagination_parsed_page.css("div.p-item-thumb")
34
42
 
35
43
  pagination_images_listings.each do |img|
44
+ if curr_index > max
45
+ return images
46
+ end
47
+
36
48
  src = img.css('img').attr('data-src').nil? == true ? img.css('img').attr('src') : img.css('img').attr('data-src')
37
49
  current_image = {
38
50
  title: img.css('a')[1].attr('title'),
@@ -41,6 +53,7 @@ module TomosiaAmanaplusCrawl
41
53
  extension: ".#{src.to_s.split('.').last}"
42
54
  }
43
55
  images << current_image
56
+ curr_index += 1
44
57
  end
45
58
 
46
59
  curr_page += 1
@@ -1,3 +1,3 @@
1
1
  module TomosiaAmanaplusCrawl
2
- VERSION = "0.1.3"
2
+ VERSION = "0.1.4"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tomosia_amanaplus_crawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nhat Huy