tomosia_amanaplus_crawl 0.1.3 → 0.1.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1daa69b29114aa1cd35172183b838b1dec522be2cc796449f6cfcb93c279a7de
4
- data.tar.gz: 1f656912c20d6e924ac40a0bc87f60b5e3c01762ccc2d2f4c72c202e23acfad0
3
+ metadata.gz: 793268d3d7e8d3c1f3fd17ca9f1283839dda2a20dfb1a8769c65fd6d25bf0236
4
+ data.tar.gz: 115062f473eb2ca55c9ceea17265b73827b53feb548a48861b808b9baa64447f
5
5
  SHA512:
6
- metadata.gz: 86f3eeed3acbe50e039ac50b1359f9b8894354d966dcf84321465c1584789bb77444acc40d608380e53b7873c265699f5e5501f44883b0178ab95baf61b87ee6
7
- data.tar.gz: 5556e3ea60053e307dc1ef5f08412220046cf07fe63ad018d99d4f82a9e756bc66a9539e3c0c190ac0de8f6a3e6cdeb94b783bdb3f2c4730300bac5e70fb9973
6
+ metadata.gz: 83ee2ae24471817f907373dd2130fd54726786eb2fca7732e9fbe10ec841b4b5cbfc3fa4925753d725eb23362c330cec055a8b43eb8f0321053e7ce497f77930
7
+ data.tar.gz: 5b92ea21908fa8288a2fa26af61c8c198b525f0de6f6aba7766e89931f064edd0bd37059d2fc205b8090bf0aac2825865fa5fe9ce8226dd7003efb74f55ecf2b
data/Gemfile CHANGED
@@ -5,8 +5,3 @@ gemspec
5
5
 
6
6
  gem "rake", "~> 12.0"
7
7
  gem "rspec", "~> 3.0"
8
-
9
- gem 'httparty'
10
- gem 'nokogiri'
11
- gem 'spreadsheet'
12
- gem 'byebug'
@@ -0,0 +1,50 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ tomosia_amanaplus_crawl (0.1.5)
5
+ httparty (= 0.18.1)
6
+ nokogiri (= 1.10.10)
7
+ spreadsheet (= 1.2.6)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ diff-lcs (1.4.4)
13
+ httparty (0.18.1)
14
+ mime-types (~> 3.0)
15
+ multi_xml (>= 0.5.2)
16
+ mime-types (3.3.1)
17
+ mime-types-data (~> 3.2015)
18
+ mime-types-data (3.2020.0512)
19
+ mini_portile2 (2.4.0)
20
+ multi_xml (0.6.0)
21
+ nokogiri (1.10.10)
22
+ mini_portile2 (~> 2.4.0)
23
+ rake (12.3.3)
24
+ rspec (3.9.0)
25
+ rspec-core (~> 3.9.0)
26
+ rspec-expectations (~> 3.9.0)
27
+ rspec-mocks (~> 3.9.0)
28
+ rspec-core (3.9.2)
29
+ rspec-support (~> 3.9.3)
30
+ rspec-expectations (3.9.2)
31
+ diff-lcs (>= 1.2.0, < 2.0)
32
+ rspec-support (~> 3.9.0)
33
+ rspec-mocks (3.9.1)
34
+ diff-lcs (>= 1.2.0, < 2.0)
35
+ rspec-support (~> 3.9.0)
36
+ rspec-support (3.9.3)
37
+ ruby-ole (1.2.12.2)
38
+ spreadsheet (1.2.6)
39
+ ruby-ole (>= 1.0)
40
+
41
+ PLATFORMS
42
+ ruby
43
+
44
+ DEPENDENCIES
45
+ rake (~> 12.0)
46
+ rspec (~> 3.0)
47
+ tomosia_amanaplus_crawl!
48
+
49
+ BUNDLED WITH
50
+ 2.1.4
data/README.md CHANGED
@@ -22,7 +22,13 @@ Or install it yourself as:
22
22
 
23
23
  ## Usage
24
24
 
25
- TODO: Write usage instructions here
25
+ ```ruby
26
+ require 'tomosia_amanaplus_crawl'
27
+ TomosiaAmanaplusCrawl::Crawler.new.run(keyword, path, max)
28
+ ```
29
+ keyword: hoian, danang, ...
30
+ path: './', '/desktop/', ...
31
+ max: số lượng ảnh muốn lấy về. Nếu max lớn hơn tổng số ảnh các page thì vẫn lấy hết tất cả ảnh
26
32
 
27
33
  ## Development
28
34
 
@@ -1,6 +1,10 @@
1
1
  require "tomosia_amanaplus_crawl/version"
2
2
 
3
3
  module TomosiaAmanaplusCrawl
4
+ def self.yeuNgucLep
5
+ puts "Yeu chi My nhieu lam"
6
+ end
7
+
4
8
  require 'nokogiri'
5
9
  require 'httparty'
6
10
  require 'open-uri'
@@ -10,21 +14,29 @@ module TomosiaAmanaplusCrawl
10
14
  class Crawler
11
15
  URL = "https://plus.amanaimages.com/items/search/"
12
16
 
13
- def run(keyword, destination)
17
+ def run(keyword, destination, max)
14
18
  unparsed_page = HTTParty.get("#{URL}/#{keyword}")
15
19
  parsed_page = Nokogiri::HTML(unparsed_page)
16
20
 
17
21
  pages = parsed_page.css("div.c-paginate__nums").css('a').last.text.to_i # tổng số page
18
22
  images_listings = parsed_page.css("div.p-search-result__body") # danh sách các thẻ div chứa image
23
+
24
+ # lấy tổng số image
25
+ total = parsed_page.css("h1.p-search-result__ttl").text.split(' ').first
26
+ total = total[11..(total.length - 1)].chop.chop.chop.sub(',', '').to_i
27
+ if max > total # nếu max lớn hơn total thì max = total => vẫn lấy hết
28
+ max = total
29
+ end
19
30
 
20
- images = getPaginationImages(images_listings, pages, keyword)
31
+ images = getPaginationImages(images_listings, pages, keyword, max)
21
32
  downloadImages(images, destination)
22
33
  writeToExcel(images, destination)
23
34
  end
24
35
 
25
- def getPaginationImages(images_listings, pages, keyword) # lấy tất cả image của các page cộng lại
36
+ def getPaginationImages(images_listings, pages, keyword, max) # lấy tất cả image của các page cộng lại
26
37
  images = Array.new
27
38
  curr_page = 1
39
+ curr_index = 1
28
40
  while curr_page <= pages
29
41
  puts "Crawling page #{curr_page}..........."
30
42
 
@@ -33,6 +45,10 @@ module TomosiaAmanaplusCrawl
33
45
  pagination_images_listings = pagination_parsed_page.css("div.p-item-thumb")
34
46
 
35
47
  pagination_images_listings.each do |img|
48
+ if curr_index > max
49
+ return images
50
+ end
51
+
36
52
  src = img.css('img').attr('data-src').nil? == true ? img.css('img').attr('src') : img.css('img').attr('data-src')
37
53
  current_image = {
38
54
  title: img.css('a')[1].attr('title'),
@@ -41,6 +57,7 @@ module TomosiaAmanaplusCrawl
41
57
  extension: ".#{src.to_s.split('.').last}"
42
58
  }
43
59
  images << current_image
60
+ curr_index += 1
44
61
  end
45
62
 
46
63
  curr_page += 1
@@ -1,3 +1,3 @@
1
1
  module TomosiaAmanaplusCrawl
2
- VERSION = "0.1.3"
2
+ VERSION = "0.1.8"
3
3
  end
@@ -14,4 +14,8 @@ Gem::Specification.new do |spec|
14
14
  end
15
15
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
16
16
  spec.require_paths = ["lib"]
17
+
18
+ spec.add_runtime_dependency('httparty', '0.18.1')
19
+ spec.add_runtime_dependency('nokogiri', '1.10.10')
20
+ spec.add_runtime_dependency('spreadsheet', '1.2.6')
17
21
  end
metadata CHANGED
@@ -1,15 +1,57 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tomosia_amanaplus_crawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nhat Huy
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-08-06 00:00:00.000000000 Z
12
- dependencies: []
11
+ date: 2020-08-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: httparty
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '='
18
+ - !ruby/object:Gem::Version
19
+ version: 0.18.1
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '='
25
+ - !ruby/object:Gem::Version
26
+ version: 0.18.1
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '='
32
+ - !ruby/object:Gem::Version
33
+ version: 1.10.10
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '='
39
+ - !ruby/object:Gem::Version
40
+ version: 1.10.10
41
+ - !ruby/object:Gem::Dependency
42
+ name: spreadsheet
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '='
46
+ - !ruby/object:Gem::Version
47
+ version: 1.2.6
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '='
53
+ - !ruby/object:Gem::Version
54
+ version: 1.2.6
13
55
  description:
14
56
  email:
15
57
  executables: []
@@ -21,6 +63,7 @@ files:
21
63
  - ".travis.yml"
22
64
  - CODE_OF_CONDUCT.md
23
65
  - Gemfile
66
+ - Gemfile.lock
24
67
  - LICENSE.txt
25
68
  - README.md
26
69
  - Rakefile