tomosia_amanaplus_crawl 0.1.0 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +0 -4
- data/Gemfile.lock +50 -0
- data/README.md +10 -1
- data/lib/tomosia_amanaplus_crawl.rb +22 -10
- data/lib/tomosia_amanaplus_crawl/version.rb +1 -1
- data/tomosia_amanaplus_crawl.gemspec +4 -12
- metadata +50 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7e09b5361e7b84f880485d64480eca862a19dcbd32c0cf4a0bab3160c3b5ab11
|
4
|
+
data.tar.gz: fc69ac8dcb7e3ba4c74412bb4760ef2cd996831d572eb63f184624330d8789a8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8ef2cbc7fa812b63ff04e136dcc5ff479fede20b7489485b83ce5dc84c5642fe3eba846bfc8a14c335b7c9271acf4bbcf2a6e0d4e2e557302488359306e820f4
|
7
|
+
data.tar.gz: 414a5491c0227c9f43830a11384e90b55305b0cdcb14d1b963c60c36aa278b748631296ad4bee837ed05184c490df8f747a0cc239025c6bd12d831ddb3ecb704
|
data/Gemfile
CHANGED
data/Gemfile.lock
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
tomosia_amanaplus_crawl (0.1.5)
|
5
|
+
httparty (= 0.18.1)
|
6
|
+
nokogiri (= 1.10.10)
|
7
|
+
spreadsheet (= 1.2.6)
|
8
|
+
|
9
|
+
GEM
|
10
|
+
remote: https://rubygems.org/
|
11
|
+
specs:
|
12
|
+
diff-lcs (1.4.4)
|
13
|
+
httparty (0.18.1)
|
14
|
+
mime-types (~> 3.0)
|
15
|
+
multi_xml (>= 0.5.2)
|
16
|
+
mime-types (3.3.1)
|
17
|
+
mime-types-data (~> 3.2015)
|
18
|
+
mime-types-data (3.2020.0512)
|
19
|
+
mini_portile2 (2.4.0)
|
20
|
+
multi_xml (0.6.0)
|
21
|
+
nokogiri (1.10.10)
|
22
|
+
mini_portile2 (~> 2.4.0)
|
23
|
+
rake (12.3.3)
|
24
|
+
rspec (3.9.0)
|
25
|
+
rspec-core (~> 3.9.0)
|
26
|
+
rspec-expectations (~> 3.9.0)
|
27
|
+
rspec-mocks (~> 3.9.0)
|
28
|
+
rspec-core (3.9.2)
|
29
|
+
rspec-support (~> 3.9.3)
|
30
|
+
rspec-expectations (3.9.2)
|
31
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
32
|
+
rspec-support (~> 3.9.0)
|
33
|
+
rspec-mocks (3.9.1)
|
34
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
35
|
+
rspec-support (~> 3.9.0)
|
36
|
+
rspec-support (3.9.3)
|
37
|
+
ruby-ole (1.2.12.2)
|
38
|
+
spreadsheet (1.2.6)
|
39
|
+
ruby-ole (>= 1.0)
|
40
|
+
|
41
|
+
PLATFORMS
|
42
|
+
ruby
|
43
|
+
|
44
|
+
DEPENDENCIES
|
45
|
+
rake (~> 12.0)
|
46
|
+
rspec (~> 3.0)
|
47
|
+
tomosia_amanaplus_crawl!
|
48
|
+
|
49
|
+
BUNDLED WITH
|
50
|
+
2.1.4
|
data/README.md
CHANGED
@@ -10,6 +10,9 @@ Add this line to your application's Gemfile:
|
|
10
10
|
|
11
11
|
```ruby
|
12
12
|
gem 'tomosia_amanaplus_crawl'
|
13
|
+
gem 'httparty'
|
14
|
+
gem 'nokogiri'
|
15
|
+
gem 'spreadsheet'
|
13
16
|
```
|
14
17
|
|
15
18
|
And then execute:
|
@@ -22,7 +25,13 @@ Or install it yourself as:
|
|
22
25
|
|
23
26
|
## Usage
|
24
27
|
|
25
|
-
|
28
|
+
```ruby
|
29
|
+
require 'tomosia_amanaplus_crawl'
|
30
|
+
TomosiaAmanaplusCrawl::Crawler.new.run(keyword, path, max)
|
31
|
+
```
|
32
|
+
keyword: hoian, danang, ...
|
33
|
+
path: './', '/desktop/', ...
|
34
|
+
max: số lượng ảnh muốn lấy về. Nếu max lớn hơn tổng số ảnh các page thì vẫn lấy hết tất cả ảnh
|
26
35
|
|
27
36
|
## Development
|
28
37
|
|
data/lib/tomosia_amanaplus_crawl.rb
CHANGED
@@ -1,30 +1,38 @@
|
|
1
1
|
require "tomosia_amanaplus_crawl/version"
|
2
|
-
require 'nokogiri'
|
3
|
-
require 'httparty'
|
4
|
-
require 'open-uri'
|
5
|
-
require 'fileutils'
|
6
|
-
require 'spreadsheet'
|
7
2
|
|
8
3
|
module TomosiaAmanaplusCrawl
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'httparty'
|
6
|
+
require 'open-uri'
|
7
|
+
require 'fileutils'
|
8
|
+
require 'spreadsheet'
|
9
|
+
|
9
10
|
class Crawler
|
10
11
|
URL = "https://plus.amanaimages.com/items/search/"
|
11
12
|
|
12
|
-
def run(keyword, destination)
|
13
|
+
def run(keyword, destination, max)
|
13
14
|
unparsed_page = HTTParty.get("#{URL}/#{keyword}")
|
14
15
|
parsed_page = Nokogiri::HTML(unparsed_page)
|
15
16
|
|
16
17
|
pages = parsed_page.css("div.c-paginate__nums").css('a').last.text.to_i # tổng số page
|
17
18
|
images_listings = parsed_page.css("div.p-search-result__body") # danh sách các thẻ div chứa image
|
19
|
+
|
20
|
+
# lấy tổng số image
|
21
|
+
total = parsed_page.css("h1.p-search-result__ttl").text.split(' ').first
|
22
|
+
total = total[11..(total.length - 1)].chop.chop.chop.sub(',', '').to_i
|
23
|
+
if max > total # nếu max lớn hơn total thì max = total => vẫn lấy hết
|
24
|
+
max = total
|
25
|
+
end
|
18
26
|
|
19
|
-
images = getPaginationImages(images_listings, pages, keyword)
|
27
|
+
images = getPaginationImages(images_listings, pages, keyword, max)
|
20
28
|
downloadImages(images, destination)
|
21
29
|
writeToExcel(images, destination)
|
22
30
|
end
|
23
31
|
|
24
|
-
def getPaginationImages(images_listings, pages, keyword) # lấy tất cả image của các page cộng lại
|
32
|
+
def getPaginationImages(images_listings, pages, keyword, max) # lấy tất cả image của các page cộng lại
|
25
33
|
images = Array.new
|
26
|
-
i = 0
|
27
34
|
curr_page = 1
|
35
|
+
curr_index = 1
|
28
36
|
while curr_page <= pages
|
29
37
|
puts "Crawling page #{curr_page}..........."
|
30
38
|
|
@@ -33,6 +41,10 @@ module TomosiaAmanaplusCrawl
|
|
33
41
|
pagination_images_listings = pagination_parsed_page.css("div.p-item-thumb")
|
34
42
|
|
35
43
|
pagination_images_listings.each do |img|
|
44
|
+
if curr_index > max
|
45
|
+
return images
|
46
|
+
end
|
47
|
+
|
36
48
|
src = img.css('img').attr('data-src').nil? == true ? img.css('img').attr('src') : img.css('img').attr('data-src')
|
37
49
|
current_image = {
|
38
50
|
title: img.css('a')[1].attr('title'),
|
@@ -41,7 +53,7 @@ module TomosiaAmanaplusCrawl
|
|
41
53
|
extension: ".#{src.to_s.split('.').last}"
|
42
54
|
}
|
43
55
|
images << current_image
|
44
|
-
|
56
|
+
curr_index += 1
|
45
57
|
end
|
46
58
|
|
47
59
|
curr_page += 1
|
data/tomosia_amanaplus_crawl.gemspec
CHANGED
@@ -4,26 +4,18 @@ Gem::Specification.new do |spec|
|
|
4
4
|
spec.name = "tomosia_amanaplus_crawl"
|
5
5
|
spec.version = TomosiaAmanaplusCrawl::VERSION
|
6
6
|
spec.authors = "Nhat Huy"
|
7
|
-
spec.email = "nhathuych@tomosia.com"
|
8
7
|
|
9
8
|
spec.summary = %q{tomosia_amanaplus_crawl demo project crawl du lieu.}
|
10
|
-
spec.description = %q{tomosia_amanaplus_crawl demo project crawl du lieu.}
|
11
9
|
spec.homepage = "https://github.com/tthuydang/tomosia_amanaplus_crawl"
|
12
|
-
spec.license = "MIT"
|
13
10
|
spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
|
14
11
|
|
15
|
-
# spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"
|
16
|
-
|
17
|
-
# spec.metadata["homepage_uri"] = spec.homepage
|
18
|
-
# spec.metadata["source_code_uri"] = "TODO: Put your gem's public repo URL here."
|
19
|
-
# spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
|
20
|
-
|
21
|
-
# Specify which files should be added to the gem when it is released.
|
22
|
-
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
23
12
|
spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
|
24
13
|
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
25
14
|
end
|
26
|
-
spec.bindir = "exe"
|
27
15
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
28
16
|
spec.require_paths = ["lib"]
|
17
|
+
|
18
|
+
spec.add_runtime_dependency('httparty', '0.18.1')
|
19
|
+
spec.add_runtime_dependency('nokogiri', '1.10.10')
|
20
|
+
spec.add_runtime_dependency('spreadsheet', '1.2.6')
|
29
21
|
end
|
metadata
CHANGED
@@ -1,17 +1,59 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tomosia_amanaplus_crawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nhat Huy
|
8
8
|
autorequire:
|
9
|
-
bindir: exe
|
9
|
+
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-08-
|
12
|
-
dependencies:
|
13
|
-
|
14
|
-
|
11
|
+
date: 2020-08-07 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: httparty
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.18.1
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.18.1
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.10.10
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.10.10
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: spreadsheet
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 1.2.6
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.2.6
|
55
|
+
description:
|
56
|
+
email:
|
15
57
|
executables: []
|
16
58
|
extensions: []
|
17
59
|
extra_rdoc_files: []
|
@@ -21,6 +63,7 @@ files:
|
|
21
63
|
- ".travis.yml"
|
22
64
|
- CODE_OF_CONDUCT.md
|
23
65
|
- Gemfile
|
66
|
+
- Gemfile.lock
|
24
67
|
- LICENSE.txt
|
25
68
|
- README.md
|
26
69
|
- Rakefile
|
@@ -30,8 +73,7 @@ files:
|
|
30
73
|
- lib/tomosia_amanaplus_crawl/version.rb
|
31
74
|
- tomosia_amanaplus_crawl.gemspec
|
32
75
|
homepage: https://github.com/tthuydang/tomosia_amanaplus_crawl
|
33
|
-
licenses:
|
34
|
-
- MIT
|
76
|
+
licenses: []
|
35
77
|
metadata: {}
|
36
78
|
post_install_message:
|
37
79
|
rdoc_options: []
|