tomosia_wallhere_crawl 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +3 -0
- data/Gemfile.lock +45 -0
- data/README.md +9 -1
- data/Rakefile +8 -0
- data/exe/tomosia_wallhere_crawl +4 -0
- data/lib/tomosia_wallhere_crawl.rb +64 -44
- data/lib/tomosia_wallhere_crawl/cli.rb +17 -0
- data/lib/tomosia_wallhere_crawl/version.rb +1 -1
- data/pkg/tomosia_wallhere_crawl-0.1.1.gem +0 -0
- data/tomosia_wallhere_crawl-0.1.0.gem +0 -0
- data/tomosia_wallhere_crawl.gemspec +11 -6
- metadata +94 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a6c61da5b1321f6aedfbcd3ba4dba8e5fb101ec4f36ca4e6c974e1f26ab01cf7
|
4
|
+
data.tar.gz: 8b3d0be435727d25a4773a02b57d8f063bba8bb7b1a550724306dedf3003de39
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6a5064bca8a988c533b110ed2b31c6f7b264030dfd4fd7ecc1766dec801096bc85c9bf374157b21bd08f2b9fee1ed9743ed7aeea845e8195b85ee30490d3280f
|
7
|
+
data.tar.gz: b07d631fa4d6c33806e91641688cda81f872ebdfa9a77fc7af830eed579aaf9e1315bdf7ce623680263fa531e94bdac0f7ba0af67597880b0e92363f5bd3cbd0
|
data/Gemfile
CHANGED
data/Gemfile.lock
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
tomosia_wallhere_crawl (0.1.1)
|
5
|
+
nokogiri
|
6
|
+
thor
|
7
|
+
writeexcel
|
8
|
+
|
9
|
+
GEM
|
10
|
+
remote: https://rubygems.org/
|
11
|
+
specs:
|
12
|
+
diff-lcs (1.4.4)
|
13
|
+
mini_portile2 (2.4.0)
|
14
|
+
nokogiri (1.10.10)
|
15
|
+
mini_portile2 (~> 2.4.0)
|
16
|
+
rake (12.3.3)
|
17
|
+
rspec (3.9.0)
|
18
|
+
rspec-core (~> 3.9.0)
|
19
|
+
rspec-expectations (~> 3.9.0)
|
20
|
+
rspec-mocks (~> 3.9.0)
|
21
|
+
rspec-core (3.9.2)
|
22
|
+
rspec-support (~> 3.9.3)
|
23
|
+
rspec-expectations (3.9.2)
|
24
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
25
|
+
rspec-support (~> 3.9.0)
|
26
|
+
rspec-mocks (3.9.1)
|
27
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
28
|
+
rspec-support (~> 3.9.0)
|
29
|
+
rspec-support (3.9.3)
|
30
|
+
thor (1.0.1)
|
31
|
+
writeexcel (1.0.5)
|
32
|
+
|
33
|
+
PLATFORMS
|
34
|
+
ruby
|
35
|
+
|
36
|
+
DEPENDENCIES
|
37
|
+
bundler
|
38
|
+
nokogiri (~> 1.10, >= 1.10.10)
|
39
|
+
rake (~> 12.0)
|
40
|
+
rspec (~> 3.0)
|
41
|
+
tomosia_wallhere_crawl!
|
42
|
+
writeexcel (~> 1.0, >= 1.0.5)
|
43
|
+
|
44
|
+
BUNDLED WITH
|
45
|
+
2.1.4
|
data/README.md
CHANGED
@@ -21,9 +21,17 @@ Or install it yourself as:
|
|
21
21
|
$ gem install tomosia_wallhere_crawl
|
22
22
|
|
23
23
|
## Usage
|
24
|
+
_NOTE: key = "tag", destination = "path of the directory to save into", max = number of images
|
24
25
|
|
25
|
-
|
26
|
+
_/ Crawl all images in a tag!
|
27
|
+
tomosia_wallhere_crawl crawl "key" --destination " "
|
28
|
+
=> tomosia_wallhere_crawl crawl "aaa" --destination "/home/tung/Desktop/img/"
|
26
29
|
|
30
|
+
_/ Crawl images in a tag, limited to a maximum number of images!
|
31
|
+
tomosia_wallhere_crawl crawl "key" --destination " " --max
|
32
|
+
=> tomosia_wallhere_crawl crawl "aaa" --destination "/home/tung/Desktop/img/" --max=100
|
33
|
+
|
34
|
+
A file with info about the tag's images (InfoImage.xls) is saved in the destination directory.
|
27
35
|
## Development
|
28
36
|
|
29
37
|
After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/Rakefile
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
require 'open-uri'
|
2
2
|
require 'nokogiri'
|
3
3
|
require 'writeexcel'
|
4
|
+
require 'thor'
|
4
5
|
module TomosiaWallhereCrawl
|
5
6
|
class CrawlImage
|
6
|
-
|
7
|
-
|
7
|
+
|
8
|
+
def savedata (data = {}, destination)
|
9
|
+
workbook = WriteExcel.new("#{destination}/InfoImage.xls")
|
8
10
|
worksheet = workbook.add_worksheet
|
9
11
|
data.each_with_index do |row, stt|
|
10
12
|
row.each do |key, value|
|
@@ -14,13 +16,14 @@ module TomosiaWallhereCrawl
|
|
14
16
|
worksheet.write_string(stt, 3, row['size'])
|
15
17
|
end
|
16
18
|
end
|
17
|
-
|
19
|
+
workbook.close
|
20
|
+
puts "Save successfully"
|
18
21
|
end
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
+
|
23
|
+
def crawldata(key,destination,max=nil)
|
22
24
|
sum = 0
|
23
|
-
|
25
|
+
index = 1
|
26
|
+
images = []
|
24
27
|
while sum != max do
|
25
28
|
# Open url
|
26
29
|
url = "https://wallhere.com/en/wallpapers?q=#{key}&page=#{index}"
|
@@ -28,45 +31,62 @@ module TomosiaWallhereCrawl
|
|
28
31
|
content = document.read
|
29
32
|
parsed_content = Nokogiri::HTML(content)
|
30
33
|
length = parsed_content.css('.item').to_a.length - 1
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
34
|
+
total_img = parsed_content.css('div.hub-totalinfo').text.split(' HD Wallpapers')[0].to_i
|
35
|
+
if max == nil || max > total_img
|
36
|
+
max = total_img
|
37
|
+
puts "This tag has #{total_img} pictures"
|
38
|
+
end
|
39
|
+
i = 0
|
35
40
|
for i in i..length
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
size = File.size("#{description}#{nameimg}")
|
47
|
-
s = "#{size} kb"
|
48
|
-
row = {'stt'=>i, 'name'=>n, 'url'=>ui, 'extension'=>ex, 'size'=>s}
|
49
|
-
data.push(row)
|
50
|
-
end
|
51
|
-
end
|
52
|
-
sum += 1
|
53
|
-
if max == sum
|
54
|
-
break
|
55
|
-
end
|
56
|
-
end
|
41
|
+
urlimg = parsed_content.css('.item').to_a[i].children.children.first.to_h['src']
|
42
|
+
images.push(urlimg)
|
43
|
+
|
44
|
+
print '.'
|
45
|
+
sum += 1
|
46
|
+
if max == sum
|
47
|
+
break
|
48
|
+
end
|
49
|
+
end
|
50
|
+
index += 1
|
57
51
|
end
|
58
|
-
|
52
|
+
download(images,destination)
|
53
|
+
end
|
54
|
+
|
55
|
+
def download(images,destination)
|
56
|
+
data = []
|
57
|
+
row = {}
|
58
|
+
thread = []
|
59
|
+
images.each do |img|
|
60
|
+
thread << Thread.new(img) do
|
61
|
+
timeout = 0
|
62
|
+
begin
|
63
|
+
open(img) do |image|
|
64
|
+
nameimg = File.basename(img,".jpg!s")
|
65
|
+
ui = img
|
66
|
+
ex = File.extname(img).delete('.!s')
|
67
|
+
size = ""
|
68
|
+
File.open("#{destination}#{nameimg}","wb") do |file|
|
69
|
+
file.write(image.read)
|
70
|
+
size = image.size
|
71
|
+
end
|
72
|
+
size = size.to_s + " bytes"
|
73
|
+
row = {"name"=>nameimg, "url"=>ui, "extension"=> ex, "size" => size}
|
74
|
+
data.push(row)
|
75
|
+
end
|
76
|
+
rescue => exception
|
77
|
+
if timeout < 3
|
78
|
+
timeout += 1
|
79
|
+
retry
|
80
|
+
else
|
81
|
+
next
|
82
|
+
end
|
83
|
+
end
|
59
84
|
end
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
crawldata(key,description,max)
|
66
|
-
end
|
67
|
-
end
|
68
|
-
mutithread.map(&:join)
|
69
|
-
end
|
85
|
+
end
|
86
|
+
thread.each {|t| t.join}
|
87
|
+
puts " "
|
88
|
+
puts "Download successfully"
|
89
|
+
savedata(data,destination)
|
70
90
|
end
|
71
91
|
end
|
72
|
-
|
92
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require_relative '../tomosia_wallhere_crawl'
|
3
|
+
|
4
|
+
module TomosiaWallhereCrawl
|
5
|
+
class Cli < Thor
|
6
|
+
desc "crawl KEYWORD", "enter KEYWORD to search"
|
7
|
+
option :destination
|
8
|
+
option :max
|
9
|
+
def crawl(keyword)
|
10
|
+
if options[:max] == nil
|
11
|
+
TomosiaWallhereCrawl::CrawlImage.new.crawldata(keyword, options[:destination], options[:max])
|
12
|
+
else
|
13
|
+
TomosiaWallhereCrawl::CrawlImage.new.crawldata(keyword, options[:destination], options[:max].to_i)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
Binary file
|
Binary file
|
@@ -11,12 +11,17 @@ Gem::Specification.new do |spec|
|
|
11
11
|
spec.homepage = "https://github.com/nguyensontung183183/tomosia_wallhere_crawl.git"
|
12
12
|
spec.license = "MIT"
|
13
13
|
spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
|
14
|
-
|
15
|
-
|
16
|
-
spec.files =
|
17
|
-
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
18
|
-
end
|
14
|
+
|
15
|
+
|
16
|
+
spec.files = `git ls-files`.split("\n")
|
19
17
|
spec.bindir = "exe"
|
20
|
-
spec.executables =
|
18
|
+
spec.executables = ["tomosia_wallhere_crawl"]
|
21
19
|
spec.require_paths = ["lib"]
|
20
|
+
|
21
|
+
spec.add_runtime_dependency 'thor'
|
22
|
+
spec.add_runtime_dependency 'nokogiri'
|
23
|
+
spec.add_runtime_dependency 'writeexcel'
|
24
|
+
spec.add_development_dependency "bundler"
|
25
|
+
spec.add_development_dependency "rake"
|
26
|
+
spec.add_development_dependency "rspec"
|
22
27
|
end
|
metadata
CHANGED
@@ -1,31 +1,121 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tomosia_wallhere_crawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- nguyen son tung
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-08-
|
12
|
-
dependencies:
|
11
|
+
date: 2020-08-09 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: thor
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: writeexcel
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: bundler
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rake
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rspec
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
13
97
|
description: Write a longer description or delete this line.
|
14
98
|
email:
|
15
99
|
- nguyensontung18183
|
16
|
-
executables:
|
100
|
+
executables:
|
101
|
+
- tomosia_wallhere_crawl
|
17
102
|
extensions: []
|
18
103
|
extra_rdoc_files: []
|
19
104
|
files:
|
20
105
|
- CODE_OF_CONDUCT.md
|
21
106
|
- Gemfile
|
107
|
+
- Gemfile.lock
|
22
108
|
- LICENSE.txt
|
23
109
|
- README.md
|
24
110
|
- Rakefile
|
25
111
|
- bin/console
|
26
112
|
- bin/setup
|
113
|
+
- exe/tomosia_wallhere_crawl
|
27
114
|
- lib/tomosia_wallhere_crawl.rb
|
115
|
+
- lib/tomosia_wallhere_crawl/cli.rb
|
28
116
|
- lib/tomosia_wallhere_crawl/version.rb
|
117
|
+
- pkg/tomosia_wallhere_crawl-0.1.1.gem
|
118
|
+
- tomosia_wallhere_crawl-0.1.0.gem
|
29
119
|
- tomosia_wallhere_crawl.gemspec
|
30
120
|
homepage: https://github.com/nguyensontung183183/tomosia_wallhere_crawl.git
|
31
121
|
licenses:
|