tomosia_amanaplus_crawl 0.1.3 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1daa69b29114aa1cd35172183b838b1dec522be2cc796449f6cfcb93c279a7de
4
- data.tar.gz: 1f656912c20d6e924ac40a0bc87f60b5e3c01762ccc2d2f4c72c202e23acfad0
3
+ metadata.gz: 793268d3d7e8d3c1f3fd17ca9f1283839dda2a20dfb1a8769c65fd6d25bf0236
4
+ data.tar.gz: 115062f473eb2ca55c9ceea17265b73827b53feb548a48861b808b9baa64447f
5
5
  SHA512:
6
- metadata.gz: 86f3eeed3acbe50e039ac50b1359f9b8894354d966dcf84321465c1584789bb77444acc40d608380e53b7873c265699f5e5501f44883b0178ab95baf61b87ee6
7
- data.tar.gz: 5556e3ea60053e307dc1ef5f08412220046cf07fe63ad018d99d4f82a9e756bc66a9539e3c0c190ac0de8f6a3e6cdeb94b783bdb3f2c4730300bac5e70fb9973
6
+ metadata.gz: 83ee2ae24471817f907373dd2130fd54726786eb2fca7732e9fbe10ec841b4b5cbfc3fa4925753d725eb23362c330cec055a8b43eb8f0321053e7ce497f77930
7
+ data.tar.gz: 5b92ea21908fa8288a2fa26af61c8c198b525f0de6f6aba7766e89931f064edd0bd37059d2fc205b8090bf0aac2825865fa5fe9ce8226dd7003efb74f55ecf2b
data/Gemfile CHANGED
@@ -5,8 +5,3 @@ gemspec
5
5
 
6
6
  gem "rake", "~> 12.0"
7
7
  gem "rspec", "~> 3.0"
8
-
9
- gem 'httparty'
10
- gem 'nokogiri'
11
- gem 'spreadsheet'
12
- gem 'byebug'
@@ -0,0 +1,50 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ tomosia_amanaplus_crawl (0.1.5)
5
+ httparty (= 0.18.1)
6
+ nokogiri (= 1.10.10)
7
+ spreadsheet (= 1.2.6)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ diff-lcs (1.4.4)
13
+ httparty (0.18.1)
14
+ mime-types (~> 3.0)
15
+ multi_xml (>= 0.5.2)
16
+ mime-types (3.3.1)
17
+ mime-types-data (~> 3.2015)
18
+ mime-types-data (3.2020.0512)
19
+ mini_portile2 (2.4.0)
20
+ multi_xml (0.6.0)
21
+ nokogiri (1.10.10)
22
+ mini_portile2 (~> 2.4.0)
23
+ rake (12.3.3)
24
+ rspec (3.9.0)
25
+ rspec-core (~> 3.9.0)
26
+ rspec-expectations (~> 3.9.0)
27
+ rspec-mocks (~> 3.9.0)
28
+ rspec-core (3.9.2)
29
+ rspec-support (~> 3.9.3)
30
+ rspec-expectations (3.9.2)
31
+ diff-lcs (>= 1.2.0, < 2.0)
32
+ rspec-support (~> 3.9.0)
33
+ rspec-mocks (3.9.1)
34
+ diff-lcs (>= 1.2.0, < 2.0)
35
+ rspec-support (~> 3.9.0)
36
+ rspec-support (3.9.3)
37
+ ruby-ole (1.2.12.2)
38
+ spreadsheet (1.2.6)
39
+ ruby-ole (>= 1.0)
40
+
41
+ PLATFORMS
42
+ ruby
43
+
44
+ DEPENDENCIES
45
+ rake (~> 12.0)
46
+ rspec (~> 3.0)
47
+ tomosia_amanaplus_crawl!
48
+
49
+ BUNDLED WITH
50
+ 2.1.4
data/README.md CHANGED
@@ -22,7 +22,13 @@ Or install it yourself as:
22
22
 
23
23
  ## Usage
24
24
 
25
- TODO: Write usage instructions here
25
+ ```ruby
26
+ require 'tomosia_amanaplus_crawl'
27
+ TomosiaAmanaplusCrawl::Crawler.new.run(keyword, path, max)
28
+ ```
29
+ keyword: hoian, danang, ...
30
+ path: './', '/desktop/', ...
31
+ max: số lượng ảnh muốn lấy về. Nếu max lớn hơn tổng số ảnh các page thì vẫn lấy hết tất cả ảnh
26
32
 
27
33
  ## Development
28
34
 
@@ -1,6 +1,10 @@
1
1
  require "tomosia_amanaplus_crawl/version"
2
2
 
3
3
  module TomosiaAmanaplusCrawl
4
+ def self.yeuNgucLep
5
+ puts "Yeu chi My nhieu lam"
6
+ end
7
+
4
8
  require 'nokogiri'
5
9
  require 'httparty'
6
10
  require 'open-uri'
@@ -10,21 +14,29 @@ module TomosiaAmanaplusCrawl
10
14
  class Crawler
11
15
  URL = "https://plus.amanaimages.com/items/search/"
12
16
 
13
- def run(keyword, destination)
17
+ def run(keyword, destination, max)
14
18
  unparsed_page = HTTParty.get("#{URL}/#{keyword}")
15
19
  parsed_page = Nokogiri::HTML(unparsed_page)
16
20
 
17
21
  pages = parsed_page.css("div.c-paginate__nums").css('a').last.text.to_i # tổng số page
18
22
  images_listings = parsed_page.css("div.p-search-result__body") # danh sách các thẻ div chứa image
23
+
24
+ # lấy tổng số image
25
+ total = parsed_page.css("h1.p-search-result__ttl").text.split(' ').first
26
+ total = total[11..(total.length - 1)].chop.chop.chop.sub(',', '').to_i
27
+ if max > total # nếu max lớn hơn total thì max = total => vẫn lấy hết
28
+ max = total
29
+ end
19
30
 
20
- images = getPaginationImages(images_listings, pages, keyword)
31
+ images = getPaginationImages(images_listings, pages, keyword, max)
21
32
  downloadImages(images, destination)
22
33
  writeToExcel(images, destination)
23
34
  end
24
35
 
25
- def getPaginationImages(images_listings, pages, keyword) # lấy tất cả image của các page cộng lại
36
+ def getPaginationImages(images_listings, pages, keyword, max) # lấy tất cả image của các page cộng lại
26
37
  images = Array.new
27
38
  curr_page = 1
39
+ curr_index = 1
28
40
  while curr_page <= pages
29
41
  puts "Crawling page #{curr_page}..........."
30
42
 
@@ -33,6 +45,10 @@ module TomosiaAmanaplusCrawl
33
45
  pagination_images_listings = pagination_parsed_page.css("div.p-item-thumb")
34
46
 
35
47
  pagination_images_listings.each do |img|
48
+ if curr_index > max
49
+ return images
50
+ end
51
+
36
52
  src = img.css('img').attr('data-src').nil? == true ? img.css('img').attr('src') : img.css('img').attr('data-src')
37
53
  current_image = {
38
54
  title: img.css('a')[1].attr('title'),
@@ -41,6 +57,7 @@ module TomosiaAmanaplusCrawl
41
57
  extension: ".#{src.to_s.split('.').last}"
42
58
  }
43
59
  images << current_image
60
+ curr_index += 1
44
61
  end
45
62
 
46
63
  curr_page += 1
@@ -1,3 +1,3 @@
1
1
  module TomosiaAmanaplusCrawl
2
- VERSION = "0.1.3"
2
+ VERSION = "0.1.8"
3
3
  end
@@ -14,4 +14,8 @@ Gem::Specification.new do |spec|
14
14
  end
15
15
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
16
16
  spec.require_paths = ["lib"]
17
+
18
+ spec.add_runtime_dependency('httparty', '0.18.1')
19
+ spec.add_runtime_dependency('nokogiri', '1.10.10')
20
+ spec.add_runtime_dependency('spreadsheet', '1.2.6')
17
21
  end
metadata CHANGED
@@ -1,15 +1,57 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tomosia_amanaplus_crawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nhat Huy
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-08-06 00:00:00.000000000 Z
12
- dependencies: []
11
+ date: 2020-08-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: httparty
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '='
18
+ - !ruby/object:Gem::Version
19
+ version: 0.18.1
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '='
25
+ - !ruby/object:Gem::Version
26
+ version: 0.18.1
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '='
32
+ - !ruby/object:Gem::Version
33
+ version: 1.10.10
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '='
39
+ - !ruby/object:Gem::Version
40
+ version: 1.10.10
41
+ - !ruby/object:Gem::Dependency
42
+ name: spreadsheet
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '='
46
+ - !ruby/object:Gem::Version
47
+ version: 1.2.6
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '='
53
+ - !ruby/object:Gem::Version
54
+ version: 1.2.6
13
55
  description:
14
56
  email:
15
57
  executables: []
@@ -21,6 +63,7 @@ files:
21
63
  - ".travis.yml"
22
64
  - CODE_OF_CONDUCT.md
23
65
  - Gemfile
66
+ - Gemfile.lock
24
67
  - LICENSE.txt
25
68
  - README.md
26
69
  - Rakefile