tomosia_amanaplus_crawl 0.1.6 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -1
- data/README.md +2 -2
- data/Rakefile +6 -0
- data/exe/tomosia_amanaplus_crawl +5 -0
- data/lib/tomosia_amanaplus_crawl.rb +22 -18
- data/lib/tomosia_amanaplus_crawl/cli.rb +14 -0
- data/lib/tomosia_amanaplus_crawl/version.rb +1 -1
- data/spec/spec_helper.rb +14 -0
- data/spec/tomosia_amanaplus_crawl_spec.rb +9 -0
- data/tomosia_amanaplus_crawl.gemspec +4 -5
- metadata +20 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 43ec94c9458a2aaf5554bf3eac7728fd8fe4a7e3850a061636ef5bf73569965e
|
4
|
+
data.tar.gz: d532f0301d3862807918bd3f1785bf0770302702dfbc4d0d0240f1ac1397a9fa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d5f54508d8a41443488ae98f9ba966317ad84a001ce1abbc4ad12e5f8c01a6e939670c5900dfd4467e7a16bb215942c96a44c01103ec906f21d839a42f78337a
|
7
|
+
data.tar.gz: 4649b6ebed46e5aaddb54d3e2337e88bc4a98cd0399efbc2b4e49aaeaef088bb38eb29e592d60de6d6c7ad4ab090039558860514946f754de491f39427ae06ed
|
data/Gemfile.lock
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
tomosia_amanaplus_crawl (0.1.
|
4
|
+
tomosia_amanaplus_crawl (0.1.8)
|
5
5
|
httparty (= 0.18.1)
|
6
6
|
nokogiri (= 1.10.10)
|
7
7
|
spreadsheet (= 1.2.6)
|
8
|
+
thor
|
8
9
|
|
9
10
|
GEM
|
10
11
|
remote: https://rubygems.org/
|
@@ -37,6 +38,7 @@ GEM
|
|
37
38
|
ruby-ole (1.2.12.2)
|
38
39
|
spreadsheet (1.2.6)
|
39
40
|
ruby-ole (>= 1.0)
|
41
|
+
thor (1.0.1)
|
40
42
|
|
41
43
|
PLATFORMS
|
42
44
|
ruby
|
data/README.md
CHANGED
@@ -23,9 +23,9 @@ Or install it yourself as:
|
|
23
23
|
## Usage
|
24
24
|
|
25
25
|
```ruby
|
26
|
-
|
27
|
-
TomosiaAmanaplusCrawl::Crawler.new.run(keyword, path, max)
|
26
|
+
tomosia_amanaplus_crawl crawl "keyword" --destination "/home/usr/Documents" --max=123
|
28
27
|
```
|
28
|
+
Example: tomosia_amanaplus_crawl crawl "hoian" --destination "./" --max=123
|
29
29
|
keyword: hoian, danang, ...
|
30
30
|
path: './', '/desktop/', ...
|
31
31
|
max: số lượng ảnh muốn lấy về. Nếu max lớn hơn tổng số ảnh các page thì vẫn lấy hết tất cả ảnh
|
data/Rakefile
CHANGED
data/lib/tomosia_amanaplus_crawl.rb
CHANGED
@@ -2,16 +2,15 @@ require "tomosia_amanaplus_crawl/version"
|
|
2
2
|
|
3
3
|
module TomosiaAmanaplusCrawl
|
4
4
|
require 'nokogiri'
|
5
|
-
require 'httparty'
|
6
5
|
require 'open-uri'
|
7
6
|
require 'fileutils'
|
8
7
|
require 'spreadsheet'
|
9
8
|
|
10
9
|
class Crawler
|
11
|
-
URL = "https://plus.amanaimages.com/items/search
|
10
|
+
URL = "https://plus.amanaimages.com/items/search"
|
12
11
|
|
13
12
|
def run(keyword, destination, max)
|
14
|
-
unparsed_page =
|
13
|
+
unparsed_page = open("#{URL}/#{keyword}").read
|
15
14
|
parsed_page = Nokogiri::HTML(unparsed_page)
|
16
15
|
|
17
16
|
pages = parsed_page.css("div.c-paginate__nums").css('a').last.text.to_i # tổng số page
|
@@ -19,7 +18,7 @@ module TomosiaAmanaplusCrawl
|
|
19
18
|
|
20
19
|
# lấy tổng số image
|
21
20
|
total = parsed_page.css("h1.p-search-result__ttl").text.split(' ').first
|
22
|
-
total = total[
|
21
|
+
total = total[(6 + keyword.length)..(total.length - 1)].chop.chop.chop.gsub(',', '').to_i
|
23
22
|
if max > total # nếu max lớn hơn total thì max = total => vẫn lấy hết
|
24
23
|
max = total
|
25
24
|
end
|
@@ -36,7 +35,7 @@ module TomosiaAmanaplusCrawl
|
|
36
35
|
while curr_page <= pages
|
37
36
|
puts "Crawling page #{curr_page}..........."
|
38
37
|
|
39
|
-
pagination_unparsed_page =
|
38
|
+
pagination_unparsed_page = open("#{URL}/#{keyword}?page=#{curr_page}").read
|
40
39
|
pagination_parsed_page = Nokogiri::HTML(pagination_unparsed_page)
|
41
40
|
pagination_images_listings = pagination_parsed_page.css("div.p-item-thumb")
|
42
41
|
|
@@ -63,20 +62,30 @@ module TomosiaAmanaplusCrawl
|
|
63
62
|
|
64
63
|
# tải hình và cập nhật lại size
|
65
64
|
def downloadImages(images, destination)
|
66
|
-
path = "#{destination}
|
65
|
+
path = "#{destination}//Downloads" # lưu hình ở folder Downloads
|
67
66
|
Dir.mkdir path unless File.exist? path
|
68
67
|
|
69
68
|
threads = []
|
70
69
|
print "\nDownloading"
|
71
70
|
images.each do |curr_image|
|
72
71
|
threads << Thread.new(curr_image) {
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
curr_image[:
|
77
|
-
|
72
|
+
timeout = 0
|
73
|
+
begin
|
74
|
+
URI.open(curr_image[:url]) do |image|
|
75
|
+
File.open("#{path}/#{curr_image[:url].split('/').last}", "a+") do |file|
|
76
|
+
file.write(image.read) # lưu hình ảnh
|
77
|
+
curr_image[:size] = image.size # cập nhật lại size trong mảng images
|
78
|
+
print "."
|
79
|
+
end
|
80
|
+
end # end open
|
81
|
+
rescue => exception
|
82
|
+
if timeout < 3
|
83
|
+
timeout += 1
|
84
|
+
retry
|
85
|
+
else
|
86
|
+
next
|
78
87
|
end
|
79
|
-
end
|
88
|
+
end
|
80
89
|
}
|
81
90
|
end
|
82
91
|
threads.each { |t| t.join }
|
@@ -84,7 +93,7 @@ module TomosiaAmanaplusCrawl
|
|
84
93
|
end
|
85
94
|
|
86
95
|
def writeToExcel(images, destination)
|
87
|
-
path = "#{destination}
|
96
|
+
path = "#{destination}//File Excel" # lưu file ở folder File Excel
|
88
97
|
Dir.mkdir path unless File.exist? path
|
89
98
|
|
90
99
|
book = Spreadsheet::Workbook.new
|
@@ -103,8 +112,3 @@ module TomosiaAmanaplusCrawl
|
|
103
112
|
|
104
113
|
end
|
105
114
|
end
|
106
|
-
|
107
|
-
def TomosiaAmanaplusCrawl(keyword = "Yêu", destination = "chị", max = "My")
|
108
|
-
puts "msg: #{keyword} #{destination} #{you}"
|
109
|
-
# TomosiaAmanaplusCrawl::Crawler.new.run(keyword, destination, max)
|
110
|
-
end
|
data/lib/tomosia_amanaplus_crawl/cli.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require_relative '../tomosia_amanaplus_crawl'
|
3
|
+
|
4
|
+
module TomosiaAmanaplusCrawl
|
5
|
+
class Cli < Thor
|
6
|
+
|
7
|
+
desc "crawl KEYWORD", "enter KEYWORD to search"
|
8
|
+
option :destination
|
9
|
+
option :max
|
10
|
+
def crawl(keyword)
|
11
|
+
TomosiaAmanaplusCrawl::Crawler.new.run(keyword, options[:destination], options[:max].to_i)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
require "bundler/setup"
|
2
|
+
require "tomosia_amanaplus_crawl"
|
3
|
+
|
4
|
+
RSpec.configure do |config|
|
5
|
+
# Enable flags like --only-failures and --next-failure
|
6
|
+
config.example_status_persistence_file_path = ".rspec_status"
|
7
|
+
|
8
|
+
# Disable RSpec exposing methods globally on `Module` and `main`
|
9
|
+
config.disable_monkey_patching!
|
10
|
+
|
11
|
+
config.expect_with :rspec do |c|
|
12
|
+
c.syntax = :expect
|
13
|
+
end
|
14
|
+
end
|
data/tomosia_amanaplus_crawl.gemspec
CHANGED
@@ -9,13 +9,12 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.homepage = "https://github.com/tthuydang/tomosia_amanaplus_crawl"
|
10
10
|
spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
|
11
11
|
|
12
|
-
spec.files =
|
13
|
-
|
14
|
-
|
15
|
-
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
12
|
+
spec.files = `git ls-files`.split("\n")
|
13
|
+
spec.bindir = "exe"
|
14
|
+
spec.executables = 'tomosia_amanaplus_crawl'
|
16
15
|
spec.require_paths = ["lib"]
|
17
16
|
|
18
|
-
spec.add_runtime_dependency('httparty', '0.18.1')
|
19
17
|
spec.add_runtime_dependency('nokogiri', '1.10.10')
|
20
18
|
spec.add_runtime_dependency('spreadsheet', '1.2.6')
|
19
|
+
spec.add_runtime_dependency('thor')
|
21
20
|
end
|
metadata
CHANGED
@@ -1,60 +1,61 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tomosia_amanaplus_crawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nhat Huy
|
8
8
|
autorequire:
|
9
|
-
bindir:
|
9
|
+
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-08-
|
11
|
+
date: 2020-08-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: nokogiri
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - '='
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 1.10.10
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - '='
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 1.10.10
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: spreadsheet
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - '='
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.
|
33
|
+
version: 1.2.6
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - '='
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 1.
|
40
|
+
version: 1.2.6
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: thor
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- -
|
45
|
+
- - ">="
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: '0'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- -
|
52
|
+
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: '0'
|
55
55
|
description:
|
56
56
|
email:
|
57
|
-
executables:
|
57
|
+
executables:
|
58
|
+
- tomosia_amanaplus_crawl
|
58
59
|
extensions: []
|
59
60
|
extra_rdoc_files: []
|
60
61
|
files:
|
@@ -69,8 +70,12 @@ files:
|
|
69
70
|
- Rakefile
|
70
71
|
- bin/console
|
71
72
|
- bin/setup
|
73
|
+
- exe/tomosia_amanaplus_crawl
|
72
74
|
- lib/tomosia_amanaplus_crawl.rb
|
75
|
+
- lib/tomosia_amanaplus_crawl/cli.rb
|
73
76
|
- lib/tomosia_amanaplus_crawl/version.rb
|
77
|
+
- spec/spec_helper.rb
|
78
|
+
- spec/tomosia_amanaplus_crawl_spec.rb
|
74
79
|
- tomosia_amanaplus_crawl.gemspec
|
75
80
|
homepage: https://github.com/tthuydang/tomosia_amanaplus_crawl
|
76
81
|
licenses: []
|