tomosia_amanaplus_crawl 0.1.8 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -9
- data/README.md +2 -2
- data/Rakefile +6 -0
- data/exe/tomosia_amanaplus_crawl +5 -0
- data/lib/tomosia_amanaplus_crawl.rb +28 -24
- data/lib/tomosia_amanaplus_crawl/cli.rb +14 -0
- data/lib/tomosia_amanaplus_crawl/version.rb +1 -1
- data/spec/spec_helper.rb +14 -0
- data/spec/tomosia_amanaplus_crawl_spec.rb +9 -0
- data/tomosia_amanaplus_crawl.gemspec +4 -5
- metadata +20 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 78adb558a362c9594df8a864dad75bf4bc95472eb64e2aa9ef1499d2a39f6837
|
4
|
+
data.tar.gz: d2d670dbc023aa1f1ce265f78245b18a2177da373b0e4436a8a88ced97f9677b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 382535d1072a6803ffd0166ee70b99187514b9119656e94c852519ef3c608ed6d829a86d19d4d8a20027a6fc1e1a3fe909defc4644e139002998751a40bb124c
|
7
|
+
data.tar.gz: d18803def75f4efa16e4e4339e3f0b1e64a6a54f77c4a8a6b45d5cb8991e6a3e5a87a0c4d1047c22ad8fdab4266c1c3dedac07660663aeb6557624266e6c6809
|
data/Gemfile.lock
CHANGED
@@ -1,23 +1,16 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
tomosia_amanaplus_crawl (0.
|
5
|
-
httparty (= 0.18.1)
|
4
|
+
tomosia_amanaplus_crawl (0.2.3)
|
6
5
|
nokogiri (= 1.10.10)
|
7
6
|
spreadsheet (= 1.2.6)
|
7
|
+
thor
|
8
8
|
|
9
9
|
GEM
|
10
10
|
remote: https://rubygems.org/
|
11
11
|
specs:
|
12
12
|
diff-lcs (1.4.4)
|
13
|
-
httparty (0.18.1)
|
14
|
-
mime-types (~> 3.0)
|
15
|
-
multi_xml (>= 0.5.2)
|
16
|
-
mime-types (3.3.1)
|
17
|
-
mime-types-data (~> 3.2015)
|
18
|
-
mime-types-data (3.2020.0512)
|
19
13
|
mini_portile2 (2.4.0)
|
20
|
-
multi_xml (0.6.0)
|
21
14
|
nokogiri (1.10.10)
|
22
15
|
mini_portile2 (~> 2.4.0)
|
23
16
|
rake (12.3.3)
|
@@ -37,6 +30,7 @@ GEM
|
|
37
30
|
ruby-ole (1.2.12.2)
|
38
31
|
spreadsheet (1.2.6)
|
39
32
|
ruby-ole (>= 1.0)
|
33
|
+
thor (1.0.1)
|
40
34
|
|
41
35
|
PLATFORMS
|
42
36
|
ruby
|
data/README.md
CHANGED
@@ -23,9 +23,9 @@ Or install it yourself as:
|
|
23
23
|
## Usage
|
24
24
|
|
25
25
|
```ruby
|
26
|
-
|
27
|
-
TomosiaAmanaplusCrawl::Crawler.new.run(keyword, path, max)
|
26
|
+
tomosia_amanaplus_crawl crawl "keyword" --destination "/home/usr/Documents" --max=123
|
28
27
|
```
|
28
|
+
Example: tomosia_amanaplus_crawl crawl "hoian" --destination "./" --max=123
|
29
29
|
keyword: hoian, danang, ...
|
30
30
|
path: './', '/desktop/', ...
|
31
31
|
max: số lượng ảnh muốn lấy về. Nếu max lớn hơn tổng số ảnh các page thì vẫn lấy hết tất cả ảnh
|
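data/Rakefile
CHANGED
[hunks for data/Rakefile and data/exe/tomosia_amanaplus_crawl are missing from this capture]
data/lib/tomosia_amanaplus_crawl.rb
CHANGED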
@@ -1,30 +1,25 @@
|
|
1
1
|
require "tomosia_amanaplus_crawl/version"
|
2
2
|
|
3
3
|
module TomosiaAmanaplusCrawl
|
4
|
-
def self.yeuNgucLep
|
5
|
-
puts "Yeu chi My nhieu lam"
|
6
|
-
end
|
7
|
-
|
8
4
|
require 'nokogiri'
|
9
|
-
require 'httparty'
|
10
5
|
require 'open-uri'
|
11
6
|
require 'fileutils'
|
12
7
|
require 'spreadsheet'
|
13
8
|
|
14
9
|
class Crawler
|
15
|
-
URL = "https://plus.amanaimages.com/items/search
|
10
|
+
URL = "https://plus.amanaimages.com/items/search"
|
16
11
|
|
17
12
|
def run(keyword, destination, max)
|
18
|
-
unparsed_page =
|
13
|
+
unparsed_page = open("#{URL}/#{keyword}").read
|
19
14
|
parsed_page = Nokogiri::HTML(unparsed_page)
|
20
15
|
|
21
16
|
pages = parsed_page.css("div.c-paginate__nums").css('a').last.text.to_i # tổng số page
|
22
17
|
images_listings = parsed_page.css("div.p-search-result__body") # danh sách các thẻ div chứa image
|
23
|
-
|
18
|
+
|
24
19
|
# lấy tổng số image
|
25
20
|
total = parsed_page.css("h1.p-search-result__ttl").text.split(' ').first
|
26
|
-
total = total[
|
27
|
-
if max > total # nếu max lớn hơn total thì max = total => vẫn lấy hết
|
21
|
+
total = total[(6 + keyword.length)..(total.length - 1)].chop.chop.chop.gsub(',', '').to_i
|
22
|
+
if max == nil || max > total # nếu max lớn hơn total thì max = total => vẫn lấy hết
|
28
23
|
max = total
|
29
24
|
end
|
30
25
|
|
@@ -39,11 +34,11 @@ module TomosiaAmanaplusCrawl
|
|
39
34
|
curr_index = 1
|
40
35
|
while curr_page <= pages
|
41
36
|
puts "Crawling page #{curr_page}..........."
|
42
|
-
|
43
|
-
pagination_unparsed_page =
|
37
|
+
|
38
|
+
pagination_unparsed_page = open("#{URL}/#{keyword}?page=#{curr_page}").read
|
44
39
|
pagination_parsed_page = Nokogiri::HTML(pagination_unparsed_page)
|
45
40
|
pagination_images_listings = pagination_parsed_page.css("div.p-item-thumb")
|
46
|
-
|
41
|
+
|
47
42
|
pagination_images_listings.each do |img|
|
48
43
|
if curr_index > max
|
49
44
|
return images
|
@@ -59,7 +54,7 @@ module TomosiaAmanaplusCrawl
|
|
59
54
|
images << current_image
|
60
55
|
curr_index += 1
|
61
56
|
end
|
62
|
-
|
57
|
+
|
63
58
|
curr_page += 1
|
64
59
|
end
|
65
60
|
images
|
@@ -67,20 +62,30 @@ module TomosiaAmanaplusCrawl
|
|
67
62
|
|
68
63
|
# tải hình và cập nhật lại size
|
69
64
|
def downloadImages(images, destination)
|
70
|
-
path = "#{destination}
|
65
|
+
path = "#{destination}//Downloads" # lưu hình ở folder Downloads
|
71
66
|
Dir.mkdir path unless File.exist? path
|
72
67
|
|
73
68
|
threads = []
|
74
69
|
print "\nDownloading"
|
75
70
|
images.each do |curr_image|
|
76
71
|
threads << Thread.new(curr_image) {
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
curr_image[:
|
81
|
-
|
72
|
+
timeout = 0
|
73
|
+
begin
|
74
|
+
URI.open(curr_image[:url]) do |image|
|
75
|
+
File.open("#{path}/#{curr_image[:url].split('/').last}", "a+") do |file|
|
76
|
+
file.write(image.read) # lưu hình ảnh
|
77
|
+
curr_image[:size] = image.size # cập nhật lại size trong mảng images
|
78
|
+
print '.'
|
79
|
+
end
|
80
|
+
end # end open
|
81
|
+
rescue => exception
|
82
|
+
if timeout < 3
|
83
|
+
timeout += 1
|
84
|
+
retry
|
85
|
+
else
|
86
|
+
next
|
82
87
|
end
|
83
|
-
end
|
88
|
+
end
|
84
89
|
}
|
85
90
|
end
|
86
91
|
threads.each { |t| t.join }
|
@@ -88,16 +93,15 @@ module TomosiaAmanaplusCrawl
|
|
88
93
|
end
|
89
94
|
|
90
95
|
def writeToExcel(images, destination)
|
91
|
-
path = "#{destination}
|
96
|
+
path = "#{destination}//File Excel" # lưu file ở folder File Excel
|
92
97
|
Dir.mkdir path unless File.exist? path
|
93
98
|
|
94
99
|
book = Spreadsheet::Workbook.new
|
95
100
|
sheet1 = book.create_worksheet
|
96
101
|
|
97
|
-
i = 0
|
98
102
|
sheet1.row(0).concat %w{Title Url Size(bytes) Extension}
|
99
103
|
puts "Writing..........."
|
100
|
-
images.
|
104
|
+
images.each_with_index do |img, i|
|
101
105
|
sheet1.row(i += 1).push img[:title], img[:url], img[:size], img[:extension]
|
102
106
|
end
|
103
107
|
puts "Writed."
|
data/lib/tomosia_amanaplus_crawl/cli.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require_relative '../tomosia_amanaplus_crawl'
|
3
|
+
|
4
|
+
module TomosiaAmanaplusCrawl
|
5
|
+
class Cli < Thor
|
6
|
+
|
7
|
+
desc "crawl KEYWORD", "enter KEYWORD to search"
|
8
|
+
option :destination
|
9
|
+
option :max
|
10
|
+
def crawl(keyword)
|
11
|
+
TomosiaAmanaplusCrawl::Crawler.new.run(keyword, options[:destination], options[:max] == nil ? nil : options[:max].to_i)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
require "bundler/setup"
|
2
|
+
require "tomosia_amanaplus_crawl"
|
3
|
+
|
4
|
+
RSpec.configure do |config|
|
5
|
+
# Enable flags like --only-failures and --next-failure
|
6
|
+
config.example_status_persistence_file_path = ".rspec_status"
|
7
|
+
|
8
|
+
# Disable RSpec exposing methods globally on `Module` and `main`
|
9
|
+
config.disable_monkey_patching!
|
10
|
+
|
11
|
+
config.expect_with :rspec do |c|
|
12
|
+
c.syntax = :expect
|
13
|
+
end
|
14
|
+
end
|
data/tomosia_amanaplus_crawl.gemspec
CHANGED
@@ -9,13 +9,12 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.homepage = "https://github.com/tthuydang/tomosia_amanaplus_crawl"
|
10
10
|
spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
|
11
11
|
|
12
|
-
spec.files =
|
13
|
-
|
14
|
-
|
15
|
-
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
12
|
+
spec.files = `git ls-files`.split("\n")
|
13
|
+
spec.bindir = "exe"
|
14
|
+
spec.executables = 'tomosia_amanaplus_crawl'
|
16
15
|
spec.require_paths = ["lib"]
|
17
16
|
|
18
|
-
spec.add_runtime_dependency('httparty', '0.18.1')
|
19
17
|
spec.add_runtime_dependency('nokogiri', '1.10.10')
|
20
18
|
spec.add_runtime_dependency('spreadsheet', '1.2.6')
|
19
|
+
spec.add_runtime_dependency('thor')
|
21
20
|
end
|
metadata
CHANGED
@@ -1,60 +1,61 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tomosia_amanaplus_crawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nhat Huy
|
8
8
|
autorequire:
|
9
|
-
bindir:
|
9
|
+
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-08-
|
11
|
+
date: 2020-08-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: nokogiri
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - '='
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 1.10.10
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - '='
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 1.10.10
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: spreadsheet
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - '='
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.
|
33
|
+
version: 1.2.6
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - '='
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 1.
|
40
|
+
version: 1.2.6
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: thor
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- -
|
45
|
+
- - ">="
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: '0'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- -
|
52
|
+
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: '0'
|
55
55
|
description:
|
56
56
|
email:
|
57
|
-
executables:
|
57
|
+
executables:
|
58
|
+
- tomosia_amanaplus_crawl
|
58
59
|
extensions: []
|
59
60
|
extra_rdoc_files: []
|
60
61
|
files:
|
@@ -69,8 +70,12 @@ files:
|
|
69
70
|
- Rakefile
|
70
71
|
- bin/console
|
71
72
|
- bin/setup
|
73
|
+
- exe/tomosia_amanaplus_crawl
|
72
74
|
- lib/tomosia_amanaplus_crawl.rb
|
75
|
+
- lib/tomosia_amanaplus_crawl/cli.rb
|
73
76
|
- lib/tomosia_amanaplus_crawl/version.rb
|
77
|
+
- spec/spec_helper.rb
|
78
|
+
- spec/tomosia_amanaplus_crawl_spec.rb
|
74
79
|
- tomosia_amanaplus_crawl.gemspec
|
75
80
|
homepage: https://github.com/tthuydang/tomosia_amanaplus_crawl
|
76
81
|
licenses: []
|