tomosia_amanaplus_crawl 0.1.3 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +0 -5
- data/Gemfile.lock +50 -0
- data/README.md +7 -1
- data/lib/tomosia_amanaplus_crawl.rb +20 -3
- data/lib/tomosia_amanaplus_crawl/version.rb +1 -1
- data/tomosia_amanaplus_crawl.gemspec +4 -0
- metadata +46 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 793268d3d7e8d3c1f3fd17ca9f1283839dda2a20dfb1a8769c65fd6d25bf0236
|
4
|
+
data.tar.gz: 115062f473eb2ca55c9ceea17265b73827b53feb548a48861b808b9baa64447f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 83ee2ae24471817f907373dd2130fd54726786eb2fca7732e9fbe10ec841b4b5cbfc3fa4925753d725eb23362c330cec055a8b43eb8f0321053e7ce497f77930
|
7
|
+
data.tar.gz: 5b92ea21908fa8288a2fa26af61c8c198b525f0de6f6aba7766e89931f064edd0bd37059d2fc205b8090bf0aac2825865fa5fe9ce8226dd7003efb74f55ecf2b
|
data/Gemfile
CHANGED
data/Gemfile.lock
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
tomosia_amanaplus_crawl (0.1.5)
|
5
|
+
httparty (= 0.18.1)
|
6
|
+
nokogiri (= 1.10.10)
|
7
|
+
spreadsheet (= 1.2.6)
|
8
|
+
|
9
|
+
GEM
|
10
|
+
remote: https://rubygems.org/
|
11
|
+
specs:
|
12
|
+
diff-lcs (1.4.4)
|
13
|
+
httparty (0.18.1)
|
14
|
+
mime-types (~> 3.0)
|
15
|
+
multi_xml (>= 0.5.2)
|
16
|
+
mime-types (3.3.1)
|
17
|
+
mime-types-data (~> 3.2015)
|
18
|
+
mime-types-data (3.2020.0512)
|
19
|
+
mini_portile2 (2.4.0)
|
20
|
+
multi_xml (0.6.0)
|
21
|
+
nokogiri (1.10.10)
|
22
|
+
mini_portile2 (~> 2.4.0)
|
23
|
+
rake (12.3.3)
|
24
|
+
rspec (3.9.0)
|
25
|
+
rspec-core (~> 3.9.0)
|
26
|
+
rspec-expectations (~> 3.9.0)
|
27
|
+
rspec-mocks (~> 3.9.0)
|
28
|
+
rspec-core (3.9.2)
|
29
|
+
rspec-support (~> 3.9.3)
|
30
|
+
rspec-expectations (3.9.2)
|
31
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
32
|
+
rspec-support (~> 3.9.0)
|
33
|
+
rspec-mocks (3.9.1)
|
34
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
35
|
+
rspec-support (~> 3.9.0)
|
36
|
+
rspec-support (3.9.3)
|
37
|
+
ruby-ole (1.2.12.2)
|
38
|
+
spreadsheet (1.2.6)
|
39
|
+
ruby-ole (>= 1.0)
|
40
|
+
|
41
|
+
PLATFORMS
|
42
|
+
ruby
|
43
|
+
|
44
|
+
DEPENDENCIES
|
45
|
+
rake (~> 12.0)
|
46
|
+
rspec (~> 3.0)
|
47
|
+
tomosia_amanaplus_crawl!
|
48
|
+
|
49
|
+
BUNDLED WITH
|
50
|
+
2.1.4
|
data/README.md
CHANGED
@@ -22,7 +22,13 @@ Or install it yourself as:
|
|
22
22
|
|
23
23
|
## Usage
|
24
24
|
|
25
|
-
|
25
|
+
```ruby
|
26
|
+
require 'tomosia_amanaplus_crawl'
|
27
|
+
TomosiaAmanaplusCrawl::Crawler.new.run(keyword, path, max)
|
28
|
+
```
|
29
|
+
keyword: hoian, danang, ...
|
30
|
+
path: './', '/desktop/', ...
|
31
|
+
max: số lượng ảnh muốn lấy về. Nếu max lớn hơn tổng số ảnh các page thì vẫn lấy hết tất cả ảnh
|
26
32
|
|
27
33
|
## Development
|
28
34
|
|
@@ -1,6 +1,10 @@
|
|
1
1
|
require "tomosia_amanaplus_crawl/version"
|
2
2
|
|
3
3
|
module TomosiaAmanaplusCrawl
|
4
|
+
def self.yeuNgucLep
|
5
|
+
puts "Yeu chi My nhieu lam"
|
6
|
+
end
|
7
|
+
|
4
8
|
require 'nokogiri'
|
5
9
|
require 'httparty'
|
6
10
|
require 'open-uri'
|
@@ -10,21 +14,29 @@ module TomosiaAmanaplusCrawl
|
|
10
14
|
class Crawler
|
11
15
|
URL = "https://plus.amanaimages.com/items/search/"
|
12
16
|
|
13
|
-
def run(keyword, destination)
|
17
|
+
def run(keyword, destination, max)
|
14
18
|
unparsed_page = HTTParty.get("#{URL}/#{keyword}")
|
15
19
|
parsed_page = Nokogiri::HTML(unparsed_page)
|
16
20
|
|
17
21
|
pages = parsed_page.css("div.c-paginate__nums").css('a').last.text.to_i # tổng số page
|
18
22
|
images_listings = parsed_page.css("div.p-search-result__body") # danh sách các thẻ div chứa image
|
23
|
+
|
24
|
+
# lấy tổng số image
|
25
|
+
total = parsed_page.css("h1.p-search-result__ttl").text.split(' ').first
|
26
|
+
total = total[11..(total.length - 1)].chop.chop.chop.sub(',', '').to_i
|
27
|
+
if max > total # nếu max lớn hơn total thì max = total => vẫn lấy hết
|
28
|
+
max = total
|
29
|
+
end
|
19
30
|
|
20
|
-
images = getPaginationImages(images_listings, pages, keyword)
|
31
|
+
images = getPaginationImages(images_listings, pages, keyword, max)
|
21
32
|
downloadImages(images, destination)
|
22
33
|
writeToExcel(images, destination)
|
23
34
|
end
|
24
35
|
|
25
|
-
def getPaginationImages(images_listings, pages, keyword) # lấy tất cả image của các page cộng lại
|
36
|
+
def getPaginationImages(images_listings, pages, keyword, max) # lấy tất cả image của các page cộng lại
|
26
37
|
images = Array.new
|
27
38
|
curr_page = 1
|
39
|
+
curr_index = 1
|
28
40
|
while curr_page <= pages
|
29
41
|
puts "Crawling page #{curr_page}..........."
|
30
42
|
|
@@ -33,6 +45,10 @@ module TomosiaAmanaplusCrawl
|
|
33
45
|
pagination_images_listings = pagination_parsed_page.css("div.p-item-thumb")
|
34
46
|
|
35
47
|
pagination_images_listings.each do |img|
|
48
|
+
if curr_index > max
|
49
|
+
return images
|
50
|
+
end
|
51
|
+
|
36
52
|
src = img.css('img').attr('data-src').nil? == true ? img.css('img').attr('src') : img.css('img').attr('data-src')
|
37
53
|
current_image = {
|
38
54
|
title: img.css('a')[1].attr('title'),
|
@@ -41,6 +57,7 @@ module TomosiaAmanaplusCrawl
|
|
41
57
|
extension: ".#{src.to_s.split('.').last}"
|
42
58
|
}
|
43
59
|
images << current_image
|
60
|
+
curr_index += 1
|
44
61
|
end
|
45
62
|
|
46
63
|
curr_page += 1
|
@@ -14,4 +14,8 @@ Gem::Specification.new do |spec|
|
|
14
14
|
end
|
15
15
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
16
16
|
spec.require_paths = ["lib"]
|
17
|
+
|
18
|
+
spec.add_runtime_dependency('httparty', '0.18.1')
|
19
|
+
spec.add_runtime_dependency('nokogiri', '1.10.10')
|
20
|
+
spec.add_runtime_dependency('spreadsheet', '1.2.6')
|
17
21
|
end
|
metadata
CHANGED
@@ -1,15 +1,57 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tomosia_amanaplus_crawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nhat Huy
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-08-
|
12
|
-
dependencies:
|
11
|
+
date: 2020-08-07 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: httparty
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.18.1
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.18.1
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.10.10
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.10.10
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: spreadsheet
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 1.2.6
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.2.6
|
13
55
|
description:
|
14
56
|
email:
|
15
57
|
executables: []
|
@@ -21,6 +63,7 @@ files:
|
|
21
63
|
- ".travis.yml"
|
22
64
|
- CODE_OF_CONDUCT.md
|
23
65
|
- Gemfile
|
66
|
+
- Gemfile.lock
|
24
67
|
- LICENSE.txt
|
25
68
|
- README.md
|
26
69
|
- Rakefile
|