tomosia_amanaplus_crawl 0.1.0 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 12228835df204c09456c99bdd9ff297d72696872ed2b90af6cbef34b0e55b293
4
- data.tar.gz: e5e2390aa64bb8e32ef48ac9f4f3ff51d8baa20a01da42126b1f26efb1902eb3
3
+ metadata.gz: 7e09b5361e7b84f880485d64480eca862a19dcbd32c0cf4a0bab3160c3b5ab11
4
+ data.tar.gz: fc69ac8dcb7e3ba4c74412bb4760ef2cd996831d572eb63f184624330d8789a8
5
5
  SHA512:
6
- metadata.gz: e160b67d73d2bafadfa57c3e492868f10bfd3ce6e8232a065faa80d9563f9d333ac9a16792c86f36c9cc0e80b344fc0c8c18e1076cf26803fb73478b4d8fe5d2
7
- data.tar.gz: 8b415f28cf91e873ebad4cdb94faca4cc231db680436294e7b7a01b31025bd9f1f60a404a420fac1897078b62693af1306cf101b1676c1df8bdf695bf35ac9d3
6
+ metadata.gz: 8ef2cbc7fa812b63ff04e136dcc5ff479fede20b7489485b83ce5dc84c5642fe3eba846bfc8a14c335b7c9271acf4bbcf2a6e0d4e2e557302488359306e820f4
7
+ data.tar.gz: 414a5491c0227c9f43830a11384e90b55305b0cdcb14d1b963c60c36aa278b748631296ad4bee837ed05184c490df8f747a0cc239025c6bd12d831ddb3ecb704
data/Gemfile CHANGED
@@ -5,7 +5,3 @@ gemspec
5
5
 
6
6
  gem "rake", "~> 12.0"
7
7
  gem "rspec", "~> 3.0"
8
- gem 'httparty'
9
- gem 'nokogiri'
10
- gem 'spreadsheet'
11
- gem 'byebug'
@@ -0,0 +1,50 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ tomosia_amanaplus_crawl (0.1.5)
5
+ httparty (= 0.18.1)
6
+ nokogiri (= 1.10.10)
7
+ spreadsheet (= 1.2.6)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ diff-lcs (1.4.4)
13
+ httparty (0.18.1)
14
+ mime-types (~> 3.0)
15
+ multi_xml (>= 0.5.2)
16
+ mime-types (3.3.1)
17
+ mime-types-data (~> 3.2015)
18
+ mime-types-data (3.2020.0512)
19
+ mini_portile2 (2.4.0)
20
+ multi_xml (0.6.0)
21
+ nokogiri (1.10.10)
22
+ mini_portile2 (~> 2.4.0)
23
+ rake (12.3.3)
24
+ rspec (3.9.0)
25
+ rspec-core (~> 3.9.0)
26
+ rspec-expectations (~> 3.9.0)
27
+ rspec-mocks (~> 3.9.0)
28
+ rspec-core (3.9.2)
29
+ rspec-support (~> 3.9.3)
30
+ rspec-expectations (3.9.2)
31
+ diff-lcs (>= 1.2.0, < 2.0)
32
+ rspec-support (~> 3.9.0)
33
+ rspec-mocks (3.9.1)
34
+ diff-lcs (>= 1.2.0, < 2.0)
35
+ rspec-support (~> 3.9.0)
36
+ rspec-support (3.9.3)
37
+ ruby-ole (1.2.12.2)
38
+ spreadsheet (1.2.6)
39
+ ruby-ole (>= 1.0)
40
+
41
+ PLATFORMS
42
+ ruby
43
+
44
+ DEPENDENCIES
45
+ rake (~> 12.0)
46
+ rspec (~> 3.0)
47
+ tomosia_amanaplus_crawl!
48
+
49
+ BUNDLED WITH
50
+ 2.1.4
data/README.md CHANGED
@@ -10,6 +10,9 @@ Add this line to your application's Gemfile:
10
10
 
11
11
  ```ruby
12
12
  gem 'tomosia_amanaplus_crawl'
13
+ gem 'httparty'
14
+ gem 'nokogiri'
15
+ gem 'spreadsheet'
13
16
  ```
14
17
 
15
18
  And then execute:
@@ -22,7 +25,13 @@ Or install it yourself as:
22
25
 
23
26
  ## Usage
24
27
 
25
- TODO: Write usage instructions here
28
+ ```ruby
29
+ require 'tomosia_amanaplus_crawl'
30
+ TomosiaAmanaplusCrawl::Crawler.new.run(keyword, path, max)
31
+ ```
32
+ keyword: hoian, danang, ...
33
+ path: './', '/desktop/', ...
34
+ max: số lượng ảnh muốn lấy về. Nếu max lớn hơn tổng số ảnh các page thì vẫn lấy hết tất cả ảnh
26
35
 
27
36
  ## Development
28
37
 
@@ -1,30 +1,38 @@
1
1
  require "tomosia_amanaplus_crawl/version"
2
- require 'nokogiri'
3
- require 'httparty'
4
- require 'open-uri'
5
- require 'fileutils'
6
- require 'spreadsheet'
7
2
 
8
3
  module TomosiaAmanaplusCrawl
4
+ require 'nokogiri'
5
+ require 'httparty'
6
+ require 'open-uri'
7
+ require 'fileutils'
8
+ require 'spreadsheet'
9
+
9
10
  class Crawler
10
11
  URL = "https://plus.amanaimages.com/items/search/"
11
12
 
12
- def run(keyword, destination)
13
+ def run(keyword, destination, max)
13
14
  unparsed_page = HTTParty.get("#{URL}/#{keyword}")
14
15
  parsed_page = Nokogiri::HTML(unparsed_page)
15
16
 
16
17
  pages = parsed_page.css("div.c-paginate__nums").css('a').last.text.to_i # tổng số page
17
18
  images_listings = parsed_page.css("div.p-search-result__body") # danh sách các thẻ div chứa image
19
+
20
+ # lấy tổng số image
21
+ total = parsed_page.css("h1.p-search-result__ttl").text.split(' ').first
22
+ total = total[11..(total.length - 1)].chop.chop.chop.sub(',', '').to_i
23
+ if max > total # nếu max lớn hơn total thì max = total => vẫn lấy hết
24
+ max = total
25
+ end
18
26
 
19
- images = getPaginationImages(images_listings, pages, keyword)
27
+ images = getPaginationImages(images_listings, pages, keyword, max)
20
28
  downloadImages(images, destination)
21
29
  writeToExcel(images, destination)
22
30
  end
23
31
 
24
- def getPaginationImages(images_listings, pages, keyword) # lấy tất cả image của các page cộng lại
32
+ def getPaginationImages(images_listings, pages, keyword, max) # lấy tất cả image của các page cộng lại
25
33
  images = Array.new
26
- i = 0
27
34
  curr_page = 1
35
+ curr_index = 1
28
36
  while curr_page <= pages
29
37
  puts "Crawling page #{curr_page}..........."
30
38
 
@@ -33,6 +41,10 @@ module TomosiaAmanaplusCrawl
33
41
  pagination_images_listings = pagination_parsed_page.css("div.p-item-thumb")
34
42
 
35
43
  pagination_images_listings.each do |img|
44
+ if curr_index > max
45
+ return images
46
+ end
47
+
36
48
  src = img.css('img').attr('data-src').nil? == true ? img.css('img').attr('src') : img.css('img').attr('data-src')
37
49
  current_image = {
38
50
  title: img.css('a')[1].attr('title'),
@@ -41,7 +53,7 @@ module TomosiaAmanaplusCrawl
41
53
  extension: ".#{src.to_s.split('.').last}"
42
54
  }
43
55
  images << current_image
44
- # puts "#{i += 1}: #{src}"
56
+ curr_index += 1
45
57
  end
46
58
 
47
59
  curr_page += 1
@@ -1,3 +1,3 @@
1
1
  module TomosiaAmanaplusCrawl
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.5"
3
3
  end
@@ -4,26 +4,18 @@ Gem::Specification.new do |spec|
4
4
  spec.name = "tomosia_amanaplus_crawl"
5
5
  spec.version = TomosiaAmanaplusCrawl::VERSION
6
6
  spec.authors = "Nhat Huy"
7
- spec.email = "nhathuych@tomosia.com"
8
7
 
9
8
  spec.summary = %q{tomosia_amanaplus_crawl demo project crawl du lieu.}
10
- spec.description = %q{tomosia_amanaplus_crawl demo project crawl du lieu.}
11
9
  spec.homepage = "https://github.com/tthuydang/tomosia_amanaplus_crawl"
12
- spec.license = "MIT"
13
10
  spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
14
11
 
15
- # spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"
16
-
17
- # spec.metadata["homepage_uri"] = spec.homepage
18
- # spec.metadata["source_code_uri"] = "TODO: Put your gem's public repo URL here."
19
- # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
20
-
21
- # Specify which files should be added to the gem when it is released.
22
- # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
23
12
  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
24
13
  `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
25
14
  end
26
- spec.bindir = "exe"
27
15
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
28
16
  spec.require_paths = ["lib"]
17
+
18
+ spec.add_runtime_dependency('httparty', '0.18.1')
19
+ spec.add_runtime_dependency('nokogiri', '1.10.10')
20
+ spec.add_runtime_dependency('spreadsheet', '1.2.6')
29
21
  end
metadata CHANGED
@@ -1,17 +1,59 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tomosia_amanaplus_crawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nhat Huy
8
8
  autorequire:
9
- bindir: exe
9
+ bindir: bin
10
10
  cert_chain: []
11
- date: 2020-08-04 00:00:00.000000000 Z
12
- dependencies: []
13
- description: tomosia_amanaplus_crawl demo project crawl du lieu.
14
- email: nhathuych@tomosia.com
11
+ date: 2020-08-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: httparty
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '='
18
+ - !ruby/object:Gem::Version
19
+ version: 0.18.1
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '='
25
+ - !ruby/object:Gem::Version
26
+ version: 0.18.1
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '='
32
+ - !ruby/object:Gem::Version
33
+ version: 1.10.10
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '='
39
+ - !ruby/object:Gem::Version
40
+ version: 1.10.10
41
+ - !ruby/object:Gem::Dependency
42
+ name: spreadsheet
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '='
46
+ - !ruby/object:Gem::Version
47
+ version: 1.2.6
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '='
53
+ - !ruby/object:Gem::Version
54
+ version: 1.2.6
55
+ description:
56
+ email:
15
57
  executables: []
16
58
  extensions: []
17
59
  extra_rdoc_files: []
@@ -21,6 +63,7 @@ files:
21
63
  - ".travis.yml"
22
64
  - CODE_OF_CONDUCT.md
23
65
  - Gemfile
66
+ - Gemfile.lock
24
67
  - LICENSE.txt
25
68
  - README.md
26
69
  - Rakefile
@@ -30,8 +73,7 @@ files:
30
73
  - lib/tomosia_amanaplus_crawl/version.rb
31
74
  - tomosia_amanaplus_crawl.gemspec
32
75
  homepage: https://github.com/tthuydang/tomosia_amanaplus_crawl
33
- licenses:
34
- - MIT
76
+ licenses: []
35
77
  metadata: {}
36
78
  post_install_message:
37
79
  rdoc_options: []