tomosia_amanaplus_crawl 0.1.3 → 0.1.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +0 -5
- data/Gemfile.lock +50 -0
- data/README.md +7 -1
- data/lib/tomosia_amanaplus_crawl.rb +20 -3
- data/lib/tomosia_amanaplus_crawl/version.rb +1 -1
- data/tomosia_amanaplus_crawl.gemspec +4 -0
- metadata +46 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 793268d3d7e8d3c1f3fd17ca9f1283839dda2a20dfb1a8769c65fd6d25bf0236
|
4
|
+
data.tar.gz: 115062f473eb2ca55c9ceea17265b73827b53feb548a48861b808b9baa64447f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 83ee2ae24471817f907373dd2130fd54726786eb2fca7732e9fbe10ec841b4b5cbfc3fa4925753d725eb23362c330cec055a8b43eb8f0321053e7ce497f77930
|
7
|
+
data.tar.gz: 5b92ea21908fa8288a2fa26af61c8c198b525f0de6f6aba7766e89931f064edd0bd37059d2fc205b8090bf0aac2825865fa5fe9ce8226dd7003efb74f55ecf2b
|
data/Gemfile
CHANGED
data/Gemfile.lock
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
tomosia_amanaplus_crawl (0.1.5)
|
5
|
+
httparty (= 0.18.1)
|
6
|
+
nokogiri (= 1.10.10)
|
7
|
+
spreadsheet (= 1.2.6)
|
8
|
+
|
9
|
+
GEM
|
10
|
+
remote: https://rubygems.org/
|
11
|
+
specs:
|
12
|
+
diff-lcs (1.4.4)
|
13
|
+
httparty (0.18.1)
|
14
|
+
mime-types (~> 3.0)
|
15
|
+
multi_xml (>= 0.5.2)
|
16
|
+
mime-types (3.3.1)
|
17
|
+
mime-types-data (~> 3.2015)
|
18
|
+
mime-types-data (3.2020.0512)
|
19
|
+
mini_portile2 (2.4.0)
|
20
|
+
multi_xml (0.6.0)
|
21
|
+
nokogiri (1.10.10)
|
22
|
+
mini_portile2 (~> 2.4.0)
|
23
|
+
rake (12.3.3)
|
24
|
+
rspec (3.9.0)
|
25
|
+
rspec-core (~> 3.9.0)
|
26
|
+
rspec-expectations (~> 3.9.0)
|
27
|
+
rspec-mocks (~> 3.9.0)
|
28
|
+
rspec-core (3.9.2)
|
29
|
+
rspec-support (~> 3.9.3)
|
30
|
+
rspec-expectations (3.9.2)
|
31
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
32
|
+
rspec-support (~> 3.9.0)
|
33
|
+
rspec-mocks (3.9.1)
|
34
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
35
|
+
rspec-support (~> 3.9.0)
|
36
|
+
rspec-support (3.9.3)
|
37
|
+
ruby-ole (1.2.12.2)
|
38
|
+
spreadsheet (1.2.6)
|
39
|
+
ruby-ole (>= 1.0)
|
40
|
+
|
41
|
+
PLATFORMS
|
42
|
+
ruby
|
43
|
+
|
44
|
+
DEPENDENCIES
|
45
|
+
rake (~> 12.0)
|
46
|
+
rspec (~> 3.0)
|
47
|
+
tomosia_amanaplus_crawl!
|
48
|
+
|
49
|
+
BUNDLED WITH
|
50
|
+
2.1.4
|
data/README.md
CHANGED
@@ -22,7 +22,13 @@ Or install it yourself as:
|
|
22
22
|
|
23
23
|
## Usage
|
24
24
|
|
25
|
-
|
25
|
+
```ruby
|
26
|
+
require 'tomosia_amanaplus_crawl'
|
27
|
+
TomosiaAmanaplusCrawl::Crawler.new.run(keyword, path, max)
|
28
|
+
```
|
29
|
+
keyword: hoian, danang, ...
|
30
|
+
path: './', '/desktop/', ...
|
31
|
+
max: số lượng ảnh muốn lấy về. Nếu max lớn hơn tổng số ảnh các page thì vẫn lấy hết tất cả ảnh
|
26
32
|
|
27
33
|
## Development
|
28
34
|
|
@@ -1,6 +1,10 @@
|
|
1
1
|
require "tomosia_amanaplus_crawl/version"
|
2
2
|
|
3
3
|
module TomosiaAmanaplusCrawl
|
4
|
+
def self.yeuNgucLep
|
5
|
+
puts "Yeu chi My nhieu lam"
|
6
|
+
end
|
7
|
+
|
4
8
|
require 'nokogiri'
|
5
9
|
require 'httparty'
|
6
10
|
require 'open-uri'
|
@@ -10,21 +14,29 @@ module TomosiaAmanaplusCrawl
|
|
10
14
|
class Crawler
|
11
15
|
URL = "https://plus.amanaimages.com/items/search/"
|
12
16
|
|
13
|
-
def run(keyword, destination)
|
17
|
+
def run(keyword, destination, max)
|
14
18
|
unparsed_page = HTTParty.get("#{URL}/#{keyword}")
|
15
19
|
parsed_page = Nokogiri::HTML(unparsed_page)
|
16
20
|
|
17
21
|
pages = parsed_page.css("div.c-paginate__nums").css('a').last.text.to_i # tổng số page
|
18
22
|
images_listings = parsed_page.css("div.p-search-result__body") # danh sách các thẻ div chứa image
|
23
|
+
|
24
|
+
# lấy tổng số image
|
25
|
+
total = parsed_page.css("h1.p-search-result__ttl").text.split(' ').first
|
26
|
+
total = total[11..(total.length - 1)].chop.chop.chop.sub(',', '').to_i
|
27
|
+
if max > total # nếu max lớn hơn total thì max = total => vẫn lấy hết
|
28
|
+
max = total
|
29
|
+
end
|
19
30
|
|
20
|
-
images = getPaginationImages(images_listings, pages, keyword)
|
31
|
+
images = getPaginationImages(images_listings, pages, keyword, max)
|
21
32
|
downloadImages(images, destination)
|
22
33
|
writeToExcel(images, destination)
|
23
34
|
end
|
24
35
|
|
25
|
-
def getPaginationImages(images_listings, pages, keyword) # lấy tất cả image của các page cộng lại
|
36
|
+
def getPaginationImages(images_listings, pages, keyword, max) # lấy tất cả image của các page cộng lại
|
26
37
|
images = Array.new
|
27
38
|
curr_page = 1
|
39
|
+
curr_index = 1
|
28
40
|
while curr_page <= pages
|
29
41
|
puts "Crawling page #{curr_page}..........."
|
30
42
|
|
@@ -33,6 +45,10 @@ module TomosiaAmanaplusCrawl
|
|
33
45
|
pagination_images_listings = pagination_parsed_page.css("div.p-item-thumb")
|
34
46
|
|
35
47
|
pagination_images_listings.each do |img|
|
48
|
+
if curr_index > max
|
49
|
+
return images
|
50
|
+
end
|
51
|
+
|
36
52
|
src = img.css('img').attr('data-src').nil? == true ? img.css('img').attr('src') : img.css('img').attr('data-src')
|
37
53
|
current_image = {
|
38
54
|
title: img.css('a')[1].attr('title'),
|
@@ -41,6 +57,7 @@ module TomosiaAmanaplusCrawl
|
|
41
57
|
extension: ".#{src.to_s.split('.').last}"
|
42
58
|
}
|
43
59
|
images << current_image
|
60
|
+
curr_index += 1
|
44
61
|
end
|
45
62
|
|
46
63
|
curr_page += 1
|
@@ -14,4 +14,8 @@ Gem::Specification.new do |spec|
|
|
14
14
|
end
|
15
15
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
16
16
|
spec.require_paths = ["lib"]
|
17
|
+
|
18
|
+
spec.add_runtime_dependency('httparty', '0.18.1')
|
19
|
+
spec.add_runtime_dependency('nokogiri', '1.10.10')
|
20
|
+
spec.add_runtime_dependency('spreadsheet', '1.2.6')
|
17
21
|
end
|
metadata
CHANGED
@@ -1,15 +1,57 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tomosia_amanaplus_crawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nhat Huy
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-08-
|
12
|
-
dependencies:
|
11
|
+
date: 2020-08-07 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: httparty
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.18.1
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.18.1
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.10.10
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.10.10
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: spreadsheet
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 1.2.6
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.2.6
|
13
55
|
description:
|
14
56
|
email:
|
15
57
|
executables: []
|
@@ -21,6 +63,7 @@ files:
|
|
21
63
|
- ".travis.yml"
|
22
64
|
- CODE_OF_CONDUCT.md
|
23
65
|
- Gemfile
|
66
|
+
- Gemfile.lock
|
24
67
|
- LICENSE.txt
|
25
68
|
- README.md
|
26
69
|
- Rakefile
|