tomosia_123rf_crawl 0.1.0 → 0.1.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +0 -3
- data/Gemfile.lock +37 -0
- data/Rakefile +10 -0
- data/exe/tomosia_123rf_crawl +3 -0
- data/lib/tomosia_123rf_crawl.rb +89 -7
- data/lib/tomosia_123rf_crawl/cli.rb +14 -0
- data/lib/tomosia_123rf_crawl/version.rb +1 -1
- data/tomosia_123rf_crawl.gemspec +14 -11
- metadata +73 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d01cc2c7974a1e7504f55c1343e24e145eff9e953f2a58103db081ee41b4e4a3
|
4
|
+
data.tar.gz: 537d07742bb26609dff324b968eec2983a391ba25e318a84410392e34e76f02f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d49573e81fd948f6bb2464acb9e5b214d9ccab3e0e1b221ee8ea7767b4bc95d692f84b3ddd22259e1c33b6244faf4febd0f3d8fe30398e7e287ce2f91e364c73
|
7
|
+
data.tar.gz: 87f9360b304fa785a0fbcda0d42a78f73aa9cfc41e337491f8c0b54875d6c8ed354235ca147c460fe7a8acce402d8252d038f54bd9d5b48d21432448e72a8e48
|
data/Gemfile
CHANGED
data/Gemfile.lock
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
tomosia_123rf_crawl (0.1.9)
|
5
|
+
httparty (= 0.18.1)
|
6
|
+
nokogiri (= 1.10.10)
|
7
|
+
spreadsheet (= 1.2.6)
|
8
|
+
thor
|
9
|
+
|
10
|
+
GEM
|
11
|
+
remote: https://rubygems.org/
|
12
|
+
specs:
|
13
|
+
httparty (0.18.1)
|
14
|
+
mime-types (~> 3.0)
|
15
|
+
multi_xml (>= 0.5.2)
|
16
|
+
mime-types (3.3.1)
|
17
|
+
mime-types-data (~> 3.2015)
|
18
|
+
mime-types-data (3.2020.0512)
|
19
|
+
mini_portile2 (2.4.0)
|
20
|
+
multi_xml (0.6.0)
|
21
|
+
nokogiri (1.10.10)
|
22
|
+
mini_portile2 (~> 2.4.0)
|
23
|
+
rake (12.3.3)
|
24
|
+
ruby-ole (1.2.12.2)
|
25
|
+
spreadsheet (1.2.6)
|
26
|
+
ruby-ole (>= 1.0)
|
27
|
+
thor (1.0.1)
|
28
|
+
|
29
|
+
PLATFORMS
|
30
|
+
ruby
|
31
|
+
|
32
|
+
DEPENDENCIES
|
33
|
+
rake (~> 12.0)
|
34
|
+
tomosia_123rf_crawl!
|
35
|
+
|
36
|
+
BUNDLED WITH
|
37
|
+
2.1.4
|
data/Rakefile
CHANGED
data/lib/tomosia_123rf_crawl.rb
CHANGED
@@ -1,11 +1,93 @@
|
|
1
1
|
require "tomosia_123rf_crawl/version"
|
2
|
-
require "tomosia_123rf_crawl/tomosia_123rf_crawl"
|
3
2
|
|
4
|
-
module
|
5
|
-
|
6
|
-
|
7
|
-
|
3
|
+
module Tomosia123rfCrawl
  require 'nokogiri'
  require 'open-uri'
  require 'fileutils'
  require 'spreadsheet'

  # Crawls 123rf.com stock-photo search result pages for a keyword,
  # downloads every thumbnail found and writes a comma-separated report of
  # the downloaded images to REPORT_FILE in the current working directory.
  class Crawler
    # Step of the `start` query parameter between two result pages. The
    # original author's note says one result page lists 110 images.
    PAGE_SIZE = 110

    # Report written by #run. NOTE(review): despite the .xls extension the
    # content is plain comma-separated text; the `spreadsheet` gem is
    # required by this file but not used to produce it.
    REPORT_FILE = 'images.xls'.freeze
    REPORT_HEADER = 'NAME,URL,SIZE,EXTENSION'.freeze

    # Value of the `sti` query parameter sent with every search request.
    # NOTE(review): opaque token copied from the original URL - its meaning
    # is not visible here; confirm it is still accepted by the site.
    SEARCH_TOKEN = 'mwk3081ovjk7062i9c|'.freeze

    # Downloads every image in +images+ into "<destination>/images", one
    # thread per image, and stores the downloaded byte size (as a String)
    # in each job's :size entry.
    #
    # @param images [Array<Hash>] jobs with at least a :link key (image URL)
    # @param destination [String, nil] base directory; nil or "" means the
    #   current directory
    # @return [Array<Thread>] the joined download threads
    # @raise [StandardError] whatever a failed download raised (re-raised
    #   by Thread#join)
    def downloadImages(images, destination)
      base = destination.to_s.empty? ? '.' : destination.to_s
      path = "#{base}/images"

      # mkdir_p also creates missing parents and is a no-op when the
      # directory already exists.
      FileUtils.mkdir_p(path)

      threads = images.map do |curr_image|
        Thread.new(curr_image) do |job|
          link = job[:link]
          # File name = last path segment of the URL without its query string.
          file_name = link.split('/').last.to_s.split('?').first.to_s

          # URI#open (from open-uri) instead of Kernel#open: Kernel#open no
          # longer opens URLs on Ruby >= 3.0 and would run a command for a
          # string starting with "|".
          URI.parse(link).open do |image|
            # "wb": truncate instead of append, and never translate bytes.
            File.open(File.join(path, file_name), 'wb') do |file|
              file.write(image.read)
            end
            job[:size] = image.size.to_s
          end
        end
      end
      threads.each(&:join)
    end
    alias download_images downloadImages

    # Crawls up to +max+ result pages for +keyword+, downloads the images of
    # each page and finally appends one report row per image to REPORT_FILE.
    #
    # @param keyword [String] search keyword, inserted into the URL as given
    # @param destination [String, nil] base directory for the "images" folder
    # @param max [Integer] maximum number of result pages to crawl
    # @return [Array<Hash>] every job collected (:name, :link, :size, :extension)
    def run(keyword, destination, max)
      jobs = []

      # Start from a fresh report that contains only the header row.
      File.open(REPORT_FILE, 'w') { |file| file.write("#{REPORT_HEADER}\n") }

      # start = 1, 111, 221, ... while start <= max * PAGE_SIZE
      1.step(max * PAGE_SIZE, PAGE_SIZE) do |start|
        pagination_url = search_url(keyword, start)
        html = URI.parse(pagination_url).open(&:read)
        page = Nokogiri::HTML(html)

        puts "Page :#{start}"
        puts pagination_url
        puts " ..............."

        # Download only the images found on this page; images of earlier
        # pages have already been fetched.
        page_jobs = page.css('div.mosaic-main-container')
                        .map { |container| build_job(container) }
                        .compact
        downloadImages(page_jobs, destination)
        jobs.concat(page_jobs)
      end

      # One row per image, written with a single file handle.
      File.open(REPORT_FILE, 'a') do |file|
        jobs.each do |job|
          row = job.values_at(:name, :link, :size, :extension).join(',')
          file.write("#{row}\n")
        end
      end

      jobs
    end

    private

    # Builds the search URL for +keyword+ starting at result offset +start+.
    def search_url(keyword, start)
      "https://www.123rf.com/stock-photo/#{keyword}.html?start=#{start}&sti=#{SEARCH_TOKEN}"
    end

    # Turns one result container node into a job hash, or nil when the
    # container holds no <img> with a src attribute.
    def build_job(container)
      img = container.at_css('img')
      src = img && img['src']
      return nil if src.nil? || src.empty?

      {
        name: src.split('/').last.to_s,
        link: src,
        size: 'nil', # placeholder until the download fills in the real size
        extension: src.split('.').last.to_s.split('?').first.to_s
      }
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require_relative '../tomosia_123rf_crawl'
|
3
|
+
|
4
|
+
module Tomosia123rfCrawl
  # Thor command-line front end: `tomosia_123rf_crawl crawl KEYWORD`.
  class Cli < Thor
    # Exit with a non-zero status when a command fails (also silences
    # Thor's deprecation warning about the undefined hook).
    def self.exit_on_failure?
      true
    end

    desc "crawl KEYWORD", "enter KEYWORD to search"
    # Defaults keep the command usable without flags: previously a missing
    # --max became 0 (nothing was crawled) and a missing --destination was
    # passed on as nil.
    option :destination, type: :string, default: '.',
                         desc: 'Directory in which the "images" folder is created'
    option :max, type: :numeric, default: 1,
                 desc: 'Maximum number of result pages to crawl'
    # Runs the crawler for +keyword+ with the parsed command-line options.
    def crawl(keyword)
      Tomosia123rfCrawl::Crawler.new.run(keyword, options[:destination], options[:max].to_i)
    end
  end
end
|
data/tomosia_123rf_crawl.gemspec
CHANGED
@@ -3,26 +3,29 @@ require_relative 'lib/tomosia_123rf_crawl/version'
|
|
3
3
|
# Gem specification for tomosia_123rf_crawl.
Gem::Specification.new do |spec|
  spec.name          = "tomosia_123rf_crawl"
  spec.version       = Tomosia123rfCrawl::VERSION
  spec.authors       = ["Minh Tuong"]
  spec.email         = ["tt.tuong.tran@tomosia.com"]

  # TODO: replace the placeholder summary/description with real text.
  spec.summary       = %q{gem complete 7/8/2020.This is gem create}
  spec.description   = %q{This is gem cool gem.}
  spec.homepage      = "https://github.com/MinhTuongbk1210/gem_crawl_data"
  spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")

  # spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"

  spec.metadata["homepage_uri"] = spec.homepage
  # spec.metadata["source_code_uri"] = "TODO: Put your gem's public repo URL here."
  # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  # NUL-separated output keeps unusual file names intact; test files are
  # not shipped.
  spec.files         = Dir.chdir(File.expand_path('..', __FILE__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir        = "exe"
  spec.executables   = ["tomosia_123rf_crawl"]
  spec.require_paths = ["lib"]

  # Pessimistic constraints instead of exact pins: the previously pinned
  # versions stay the minimum, while later compatible releases (including
  # security fixes) can still be resolved.
  # NOTE(review): httparty is not referenced by the library code visible in
  # this change - confirm it is still needed.
  spec.add_runtime_dependency('httparty', '~> 0.18', '>= 0.18.1')
  spec.add_runtime_dependency('nokogiri', '~> 1.10', '>= 1.10.10')
  spec.add_runtime_dependency('spreadsheet', '~> 1.2', '>= 1.2.6')
  spec.add_runtime_dependency('thor')
end
|
metadata
CHANGED
@@ -1,35 +1,96 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tomosia_123rf_crawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
-
|
7
|
+
- Minh Tuong
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-08-
|
12
|
-
dependencies:
|
13
|
-
|
14
|
-
|
15
|
-
|
11
|
+
date: 2020-08-10 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: httparty
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.18.1
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.18.1
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.10.10
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.10.10
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: spreadsheet
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 1.2.6
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.2.6
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: thor
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
description: This is gem cool gem.
|
70
|
+
email:
|
71
|
+
- tt.tuong.tran@tomosia.com
|
72
|
+
executables:
|
73
|
+
- tomosia_123rf_crawl
|
16
74
|
extensions: []
|
17
75
|
extra_rdoc_files: []
|
18
76
|
files:
|
19
77
|
- ".gitignore"
|
20
78
|
- CODE_OF_CONDUCT.md
|
21
79
|
- Gemfile
|
80
|
+
- Gemfile.lock
|
22
81
|
- README.md
|
23
82
|
- Rakefile
|
24
83
|
- bin/console
|
25
84
|
- bin/setup
|
85
|
+
- exe/tomosia_123rf_crawl
|
26
86
|
- lib/tomosia_123rf_crawl.rb
|
87
|
+
- lib/tomosia_123rf_crawl/cli.rb
|
27
88
|
- lib/tomosia_123rf_crawl/version.rb
|
28
89
|
- tomosia_123rf_crawl.gemspec
|
29
|
-
homepage: https://github.com/MinhTuongbk1210/
|
30
|
-
licenses:
|
31
|
-
|
32
|
-
|
90
|
+
homepage: https://github.com/MinhTuongbk1210/gem_crawl_data
|
91
|
+
licenses: []
|
92
|
+
metadata:
|
93
|
+
homepage_uri: https://github.com/MinhTuongbk1210/gem_crawl_data
|
33
94
|
post_install_message:
|
34
95
|
rdoc_options: []
|
35
96
|
require_paths:
|
@@ -48,5 +109,5 @@ requirements: []
|
|
48
109
|
rubygems_version: 3.1.4
|
49
110
|
signing_key:
|
50
111
|
specification_version: 4
|
51
|
-
summary: gem complete
|
112
|
+
summary: gem complete 7/8/2020.This is gem create
|
52
113
|
test_files: []
|