tomosia_amanaplus_crawl 0.1.3 → 0.1.4

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1daa69b29114aa1cd35172183b838b1dec522be2cc796449f6cfcb93c279a7de
4
- data.tar.gz: 1f656912c20d6e924ac40a0bc87f60b5e3c01762ccc2d2f4c72c202e23acfad0
3
+ metadata.gz: 2832ce228e9bc2ebcce58c1dde236cc0f6bfa4c789654e976cb66defe368fb39
4
+ data.tar.gz: 790070002318d2d3c3727b9fd3986b655624579d3130e32e0ba6c88bd65485cd
5
5
  SHA512:
6
- metadata.gz: 86f3eeed3acbe50e039ac50b1359f9b8894354d966dcf84321465c1584789bb77444acc40d608380e53b7873c265699f5e5501f44883b0178ab95baf61b87ee6
7
- data.tar.gz: 5556e3ea60053e307dc1ef5f08412220046cf07fe63ad018d99d4f82a9e756bc66a9539e3c0c190ac0de8f6a3e6cdeb94b783bdb3f2c4730300bac5e70fb9973
6
+ metadata.gz: 8b174ec110e8933aae815abef670ae0a33ae69077b98268522aaf83a52702f7d3c303cc9bbf73d5e8ec7d27a9cb2deb1cd1d05e6edfc9b5ecff8fa94b49af4d9
7
+ data.tar.gz: badfef5587e0b7989a313caf4490f59efba61d9e743f16126ae1bd7915abd7d878be1467bbcef967c4968c590c7c4383f542c0a8a51f5bf07783759ae4b216da
data/Gemfile CHANGED
@@ -9,4 +9,3 @@ gem "rspec", "~> 3.0"
9
9
  gem 'httparty'
10
10
  gem 'nokogiri'
11
11
  gem 'spreadsheet'
12
- gem 'byebug'
data/README.md CHANGED
@@ -10,6 +10,9 @@ Add this line to your application's Gemfile:
10
10
 
11
11
  ```ruby
12
12
  gem 'tomosia_amanaplus_crawl'
13
+ gem 'httparty'
14
+ gem 'nokogiri'
15
+ gem 'spreadsheet'
13
16
  ```
14
17
 
15
18
  And then execute:
@@ -22,7 +25,13 @@ Or install it yourself as:
22
25
 
23
26
  ## Usage
24
27
 
25
- TODO: Write usage instructions here
28
+ ```ruby
29
+ require 'tomosia_amanaplus_crawl'
30
+ TomosiaAmanaplusCrawl::Crawler.new.run(keyword, path, max)
31
+ ```
32
+ keyword: hoian, danang, ...
33
+ path: './', '/desktop/', ...
34
+ max: số lượng ảnh muốn lấy về. Nếu max lớn hơn tổng số ảnh các page thì vẫn lấy hết tất cả ảnh
26
35
 
27
36
  ## Development
28
37
 
@@ -10,21 +10,29 @@ module TomosiaAmanaplusCrawl
10
10
  class Crawler
11
11
  URL = "https://plus.amanaimages.com/items/search/"
12
12
 
13
- def run(keyword, destination)
13
+ def run(keyword, destination, max)
14
14
  unparsed_page = HTTParty.get("#{URL}/#{keyword}")
15
15
  parsed_page = Nokogiri::HTML(unparsed_page)
16
16
 
17
17
  pages = parsed_page.css("div.c-paginate__nums").css('a').last.text.to_i # tổng số page
18
18
  images_listings = parsed_page.css("div.p-search-result__body") # danh sách các thẻ div chứa image
19
+
20
+ # lấy tổng số image
21
+ total = parsed_page.css("h1.p-search-result__ttl").text.split(' ').first
22
+ total = total[11..(total.length - 1)].chop.chop.chop.sub(',', '').to_i
23
+ if max > total # nếu max lớn hơn total thì max = total => vẫn lấy hết
24
+ max = total
25
+ end
19
26
 
20
- images = getPaginationImages(images_listings, pages, keyword)
27
+ images = getPaginationImages(images_listings, pages, keyword, max)
21
28
  downloadImages(images, destination)
22
29
  writeToExcel(images, destination)
23
30
  end
24
31
 
25
- def getPaginationImages(images_listings, pages, keyword) # lấy tất cả image của các page cộng lại
32
+ def getPaginationImages(images_listings, pages, keyword, max) # lấy tất cả image của các page cộng lại
26
33
  images = Array.new
27
34
  curr_page = 1
35
+ curr_index = 1
28
36
  while curr_page <= pages
29
37
  puts "Crawling page #{curr_page}..........."
30
38
 
@@ -33,6 +41,10 @@ module TomosiaAmanaplusCrawl
33
41
  pagination_images_listings = pagination_parsed_page.css("div.p-item-thumb")
34
42
 
35
43
  pagination_images_listings.each do |img|
44
+ if curr_index > max
45
+ return images
46
+ end
47
+
36
48
  src = img.css('img').attr('data-src').nil? == true ? img.css('img').attr('src') : img.css('img').attr('data-src')
37
49
  current_image = {
38
50
  title: img.css('a')[1].attr('title'),
@@ -41,6 +53,7 @@ module TomosiaAmanaplusCrawl
41
53
  extension: ".#{src.to_s.split('.').last}"
42
54
  }
43
55
  images << current_image
56
+ curr_index += 1
44
57
  end
45
58
 
46
59
  curr_page += 1
@@ -1,3 +1,3 @@
1
1
  module TomosiaAmanaplusCrawl
2
- VERSION = "0.1.3"
2
+ VERSION = "0.1.4"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tomosia_amanaplus_crawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nhat Huy