tomosia_amanaplus_crawl 0.1.0 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 12228835df204c09456c99bdd9ff297d72696872ed2b90af6cbef34b0e55b293
4
- data.tar.gz: e5e2390aa64bb8e32ef48ac9f4f3ff51d8baa20a01da42126b1f26efb1902eb3
3
+ metadata.gz: 7e09b5361e7b84f880485d64480eca862a19dcbd32c0cf4a0bab3160c3b5ab11
4
+ data.tar.gz: fc69ac8dcb7e3ba4c74412bb4760ef2cd996831d572eb63f184624330d8789a8
5
5
  SHA512:
6
- metadata.gz: e160b67d73d2bafadfa57c3e492868f10bfd3ce6e8232a065faa80d9563f9d333ac9a16792c86f36c9cc0e80b344fc0c8c18e1076cf26803fb73478b4d8fe5d2
7
- data.tar.gz: 8b415f28cf91e873ebad4cdb94faca4cc231db680436294e7b7a01b31025bd9f1f60a404a420fac1897078b62693af1306cf101b1676c1df8bdf695bf35ac9d3
6
+ metadata.gz: 8ef2cbc7fa812b63ff04e136dcc5ff479fede20b7489485b83ce5dc84c5642fe3eba846bfc8a14c335b7c9271acf4bbcf2a6e0d4e2e557302488359306e820f4
7
+ data.tar.gz: 414a5491c0227c9f43830a11384e90b55305b0cdcb14d1b963c60c36aa278b748631296ad4bee837ed05184c490df8f747a0cc239025c6bd12d831ddb3ecb704
data/Gemfile CHANGED
@@ -5,7 +5,3 @@ gemspec
5
5
 
6
6
  gem "rake", "~> 12.0"
7
7
  gem "rspec", "~> 3.0"
8
- gem 'httparty'
9
- gem 'nokogiri'
10
- gem 'spreadsheet'
11
- gem 'byebug'
data/Gemfile.lock ADDED
@@ -0,0 +1,50 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ tomosia_amanaplus_crawl (0.1.5)
5
+ httparty (= 0.18.1)
6
+ nokogiri (= 1.10.10)
7
+ spreadsheet (= 1.2.6)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ diff-lcs (1.4.4)
13
+ httparty (0.18.1)
14
+ mime-types (~> 3.0)
15
+ multi_xml (>= 0.5.2)
16
+ mime-types (3.3.1)
17
+ mime-types-data (~> 3.2015)
18
+ mime-types-data (3.2020.0512)
19
+ mini_portile2 (2.4.0)
20
+ multi_xml (0.6.0)
21
+ nokogiri (1.10.10)
22
+ mini_portile2 (~> 2.4.0)
23
+ rake (12.3.3)
24
+ rspec (3.9.0)
25
+ rspec-core (~> 3.9.0)
26
+ rspec-expectations (~> 3.9.0)
27
+ rspec-mocks (~> 3.9.0)
28
+ rspec-core (3.9.2)
29
+ rspec-support (~> 3.9.3)
30
+ rspec-expectations (3.9.2)
31
+ diff-lcs (>= 1.2.0, < 2.0)
32
+ rspec-support (~> 3.9.0)
33
+ rspec-mocks (3.9.1)
34
+ diff-lcs (>= 1.2.0, < 2.0)
35
+ rspec-support (~> 3.9.0)
36
+ rspec-support (3.9.3)
37
+ ruby-ole (1.2.12.2)
38
+ spreadsheet (1.2.6)
39
+ ruby-ole (>= 1.0)
40
+
41
+ PLATFORMS
42
+ ruby
43
+
44
+ DEPENDENCIES
45
+ rake (~> 12.0)
46
+ rspec (~> 3.0)
47
+ tomosia_amanaplus_crawl!
48
+
49
+ BUNDLED WITH
50
+ 2.1.4
data/README.md CHANGED
@@ -10,6 +10,9 @@ Add this line to your application's Gemfile:
10
10
 
11
11
  ```ruby
12
12
  gem 'tomosia_amanaplus_crawl'
13
+ gem 'httparty'
14
+ gem 'nokogiri'
15
+ gem 'spreadsheet'
13
16
  ```
14
17
 
15
18
  And then execute:
@@ -22,7 +25,13 @@ Or install it yourself as:
22
25
 
23
26
  ## Usage
24
27
 
25
- TODO: Write usage instructions here
28
+ ```ruby
29
+ require 'tomosia_amanaplus_crawl'
30
+ TomosiaAmanaplusCrawl::Crawler.new.run(keyword, path, max)
31
+ ```
32
+ keyword: hoian, danang, ...
33
+ path: './', '/desktop/', ...
34
+ max: số lượng ảnh muốn lấy về. Nếu max lớn hơn tổng số ảnh các page thì vẫn lấy hết tất cả ảnh
26
35
 
27
36
  ## Development
28
37
 
data/lib/tomosia_amanaplus_crawl.rb CHANGED
@@ -1,30 +1,38 @@
1
1
  require "tomosia_amanaplus_crawl/version"
2
- require 'nokogiri'
3
- require 'httparty'
4
- require 'open-uri'
5
- require 'fileutils'
6
- require 'spreadsheet'
7
2
 
8
3
  module TomosiaAmanaplusCrawl
4
+ require 'nokogiri'
5
+ require 'httparty'
6
+ require 'open-uri'
7
+ require 'fileutils'
8
+ require 'spreadsheet'
9
+
9
10
  class Crawler
10
11
  URL = "https://plus.amanaimages.com/items/search/"
11
12
 
12
- def run(keyword, destination)
13
+ def run(keyword, destination, max)
13
14
  unparsed_page = HTTParty.get("#{URL}/#{keyword}")
14
15
  parsed_page = Nokogiri::HTML(unparsed_page)
15
16
 
16
17
  pages = parsed_page.css("div.c-paginate__nums").css('a').last.text.to_i # tổng số page
17
18
  images_listings = parsed_page.css("div.p-search-result__body") # danh sách các thẻ div chứa image
19
+
20
+ # lấy tổng số image
21
+ total = parsed_page.css("h1.p-search-result__ttl").text.split(' ').first
22
+ total = total[11..(total.length - 1)].chop.chop.chop.sub(',', '').to_i
23
+ if max > total # nếu max lớn hơn total thì max = total => vẫn lấy hết
24
+ max = total
25
+ end
18
26
 
19
- images = getPaginationImages(images_listings, pages, keyword)
27
+ images = getPaginationImages(images_listings, pages, keyword, max)
20
28
  downloadImages(images, destination)
21
29
  writeToExcel(images, destination)
22
30
  end
23
31
 
24
- def getPaginationImages(images_listings, pages, keyword) # lấy tất cả image của các page cộng lại
32
+ def getPaginationImages(images_listings, pages, keyword, max) # lấy tất cả image của các page cộng lại
25
33
  images = Array.new
26
- i = 0
27
34
  curr_page = 1
35
+ curr_index = 1
28
36
  while curr_page <= pages
29
37
  puts "Crawling page #{curr_page}..........."
30
38
 
@@ -33,6 +41,10 @@ module TomosiaAmanaplusCrawl
33
41
  pagination_images_listings = pagination_parsed_page.css("div.p-item-thumb")
34
42
 
35
43
  pagination_images_listings.each do |img|
44
+ if curr_index > max
45
+ return images
46
+ end
47
+
36
48
  src = img.css('img').attr('data-src').nil? == true ? img.css('img').attr('src') : img.css('img').attr('data-src')
37
49
  current_image = {
38
50
  title: img.css('a')[1].attr('title'),
@@ -41,7 +53,7 @@ module TomosiaAmanaplusCrawl
41
53
  extension: ".#{src.to_s.split('.').last}"
42
54
  }
43
55
  images << current_image
44
- # puts "#{i += 1}: #{src}"
56
+ curr_index += 1
45
57
  end
46
58
 
47
59
  curr_page += 1
data/lib/tomosia_amanaplus_crawl/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module TomosiaAmanaplusCrawl
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.5"
3
3
  end
data/tomosia_amanaplus_crawl.gemspec CHANGED
@@ -4,26 +4,18 @@ Gem::Specification.new do |spec|
4
4
  spec.name = "tomosia_amanaplus_crawl"
5
5
  spec.version = TomosiaAmanaplusCrawl::VERSION
6
6
  spec.authors = "Nhat Huy"
7
- spec.email = "nhathuych@tomosia.com"
8
7
 
9
8
  spec.summary = %q{tomosia_amanaplus_crawl demo project crawl du lieu.}
10
- spec.description = %q{tomosia_amanaplus_crawl demo project crawl du lieu.}
11
9
  spec.homepage = "https://github.com/tthuydang/tomosia_amanaplus_crawl"
12
- spec.license = "MIT"
13
10
  spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
14
11
 
15
- # spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"
16
-
17
- # spec.metadata["homepage_uri"] = spec.homepage
18
- # spec.metadata["source_code_uri"] = "TODO: Put your gem's public repo URL here."
19
- # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
20
-
21
- # Specify which files should be added to the gem when it is released.
22
- # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
23
12
  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
24
13
  `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
25
14
  end
26
- spec.bindir = "exe"
27
15
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
28
16
  spec.require_paths = ["lib"]
17
+
18
+ spec.add_runtime_dependency('httparty', '0.18.1')
19
+ spec.add_runtime_dependency('nokogiri', '1.10.10')
20
+ spec.add_runtime_dependency('spreadsheet', '1.2.6')
29
21
  end
metadata CHANGED
@@ -1,17 +1,59 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tomosia_amanaplus_crawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nhat Huy
8
8
  autorequire:
9
- bindir: exe
9
+ bindir: bin
10
10
  cert_chain: []
11
- date: 2020-08-04 00:00:00.000000000 Z
12
- dependencies: []
13
- description: tomosia_amanaplus_crawl demo project crawl du lieu.
14
- email: nhathuych@tomosia.com
11
+ date: 2020-08-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: httparty
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '='
18
+ - !ruby/object:Gem::Version
19
+ version: 0.18.1
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '='
25
+ - !ruby/object:Gem::Version
26
+ version: 0.18.1
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '='
32
+ - !ruby/object:Gem::Version
33
+ version: 1.10.10
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '='
39
+ - !ruby/object:Gem::Version
40
+ version: 1.10.10
41
+ - !ruby/object:Gem::Dependency
42
+ name: spreadsheet
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '='
46
+ - !ruby/object:Gem::Version
47
+ version: 1.2.6
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '='
53
+ - !ruby/object:Gem::Version
54
+ version: 1.2.6
55
+ description:
56
+ email:
15
57
  executables: []
16
58
  extensions: []
17
59
  extra_rdoc_files: []
@@ -21,6 +63,7 @@ files:
21
63
  - ".travis.yml"
22
64
  - CODE_OF_CONDUCT.md
23
65
  - Gemfile
66
+ - Gemfile.lock
24
67
  - LICENSE.txt
25
68
  - README.md
26
69
  - Rakefile
@@ -30,8 +73,7 @@ files:
30
73
  - lib/tomosia_amanaplus_crawl/version.rb
31
74
  - tomosia_amanaplus_crawl.gemspec
32
75
  homepage: https://github.com/tthuydang/tomosia_amanaplus_crawl
33
- licenses:
34
- - MIT
76
+ licenses: []
35
77
  metadata: {}
36
78
  post_install_message:
37
79
  rdoc_options: []