tomosia_123rf_crawl 0.1.0 → 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1f78b6e2908eae0a4cbdb18a6004a0cdd1bf85665aeb8c3a56efc9fb66cd0434
4
- data.tar.gz: 7e2b5142e956f9978f9fabac6d32824378003f2a45e5131554560831c2f2ad3f
3
+ metadata.gz: d01cc2c7974a1e7504f55c1343e24e145eff9e953f2a58103db081ee41b4e4a3
4
+ data.tar.gz: 537d07742bb26609dff324b968eec2983a391ba25e318a84410392e34e76f02f
5
5
  SHA512:
6
- metadata.gz: f55f661cded1ff753eb0d13069266d9c81e9f3f5d8dc0ace6d9a9d5aacf7e287e3e81d3871de9fe4ca978ee1893c70d56f088eb80dc0fdcae012b78f0a3bffee
7
- data.tar.gz: 461112743f48a7e37e25e16beb4bef9c6251b5c442cd283e829fb543f9c9f9eac82bec80fb23244df25b7741f5ee8902905c3e02862ae32146c58ee71c11614f
6
+ metadata.gz: d49573e81fd948f6bb2464acb9e5b214d9ccab3e0e1b221ee8ea7767b4bc95d692f84b3ddd22259e1c33b6244faf4febd0f3d8fe30398e7e287ce2f91e364c73
7
+ data.tar.gz: 87f9360b304fa785a0fbcda0d42a78f73aa9cfc41e337491f8c0b54875d6c8ed354235ca147c460fe7a8acce402d8252d038f54bd9d5b48d21432448e72a8e48
data/Gemfile CHANGED
@@ -4,6 +4,3 @@ source "https://rubygems.org"
4
4
  gemspec
5
5
 
6
6
  gem "rake", "~> 12.0"
7
- gem 'httparty'
8
- gem 'nokogiri'
9
-
@@ -0,0 +1,37 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ tomosia_123rf_crawl (0.1.9)
5
+ httparty (= 0.18.1)
6
+ nokogiri (= 1.10.10)
7
+ spreadsheet (= 1.2.6)
8
+ thor
9
+
10
+ GEM
11
+ remote: https://rubygems.org/
12
+ specs:
13
+ httparty (0.18.1)
14
+ mime-types (~> 3.0)
15
+ multi_xml (>= 0.5.2)
16
+ mime-types (3.3.1)
17
+ mime-types-data (~> 3.2015)
18
+ mime-types-data (3.2020.0512)
19
+ mini_portile2 (2.4.0)
20
+ multi_xml (0.6.0)
21
+ nokogiri (1.10.10)
22
+ mini_portile2 (~> 2.4.0)
23
+ rake (12.3.3)
24
+ ruby-ole (1.2.12.2)
25
+ spreadsheet (1.2.6)
26
+ ruby-ole (>= 1.0)
27
+ thor (1.0.1)
28
+
29
+ PLATFORMS
30
+ ruby
31
+
32
+ DEPENDENCIES
33
+ rake (~> 12.0)
34
+ tomosia_123rf_crawl!
35
+
36
+ BUNDLED WITH
37
+ 2.1.4
data/Rakefile CHANGED
@@ -1,2 +1,12 @@
1
1
  require "bundler/gem_tasks"
2
+ # require "rspec/core/rake task"
3
+ # Rspec::Core::RakeTask.new(:spec)
4
+
2
5
  task :default => :spec
6
+
7
+ namespace :gem do
8
+ task :build do
9
+ system "rake build && rake install"
10
+ end
11
+ end
12
+
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+ require 'tomosia_123rf_crawl/cli'
3
+ Tomosia123rfCrawl::Cli.start
@@ -1,11 +1,93 @@
1
1
  require "tomosia_123rf_crawl/version"
2
- require "tomosia_123rf_crawl/tomosia_123rf_crawl"
3
2
 
4
- module ActionView
5
- module Helpers
6
- module AssetTagHelper
7
- include Tomosia123rfCrawl
3
+ module Tomosia123rfCrawl
4
+ require 'nokogiri'
5
+ require 'open-uri'
6
+ require 'fileutils'
7
+ require 'spreadsheet'
8
+
9
+ class Crawler
10
+
11
+ #Hàm dùng để download các ảnh từ web về
12
+ def downloadImages(images, destination)
13
+ #Tạo forder dùng để lưu hình ảnh
14
+ path= "#{destination}/images"
15
+
16
+ #Tạo thư mục chỉ định của người dùng
17
+ Dir.mkdir path unless File.exist? path
18
+
19
+ threads = []
20
+ images.each do |curr_image|
21
+ threads << Thread.new(curr_image){
22
+ open(curr_image[:link]) do |image|
23
+ File.open("#{path}/".concat(curr_image[:link].split('/').last.to_s.split('?').first.to_s),"a+") do |file|
24
+ file.write(image.read)
25
+ curr_image[:size] = image.size.to_s
26
+ end
27
+ end
28
+
29
+ }
30
+ end
31
+ threads.each{|t| t.join}
8
32
  end
9
- end
10
- end
11
33
 
34
+ def run(keyword, destination, max)
35
+
36
+ url = "https://www.123rf.com/stock-photo/dog.html?start=1&sti=mwk3081ovjk7062i9c|"
37
+ unparsed_page = open("#{url}/#{keyword}").read
38
+ parsed_page = Nokogiri::HTML(unparsed_page)
39
+ jobs = Array.new
40
+
41
+ list_imgs = parsed_page.css('div.mosaic-main-container')
42
+ #list_imgs.count = 110 imges
43
+ #Ta sẽ crawl data từ web với số trang tối đa mà ta muốn crawl data
44
+
45
+ count_page = 1
46
+ #Tạo file dùng để ghi những thông tin của file
47
+ File.delete("images.xls") if File.file?("images.xls")
48
+ File.new("images.xls","a+")
49
+ File.open("images.xls","a") do |file|
50
+ file.write("NAME,URL,SIZE,EXTENSION")
51
+ file.write("\n")
52
+ end
53
+
54
+
55
+
56
+ while count_page <= max*110
57
+ pagination_url ="https://www.123rf.com/stock-photo/#{keyword}.html?start=#{count_page}&sti=mwk3081ovjk7062i9c|"
58
+ pagination_unparsed_page = open(pagination_url).read
59
+ pagination_parsed_page = Nokogiri::HTML( pagination_unparsed_page )
60
+ pagination_list_imgs = pagination_parsed_page.css('div.mosaic-main-container')
61
+ puts "Page :#{count_page}"
62
+ puts pagination_url
63
+ puts " ..............."
64
+
65
+
66
+ #Ghi những thông tin về ảnh mà ta muốn download được lưu vào trong mảng jobs
67
+ pagination_list_imgs.each do |img_list|
68
+ job={
69
+ name: img_list.css('img').attr('src').text.split('/').last.to_s,
70
+ link: img_list.css('img').attr('src').text,
71
+ size: 'nil',
72
+ extension: img_list.css('img').attr('src').text.split('.').last.to_s.split('?').first.to_s
73
+
74
+ }
75
+ jobs << job
76
+ end
77
+ downloadImages(jobs, destination)
78
+
79
+
80
+ count_page+=110
81
+ end
82
+
83
+ #Ghi những thông tin ảnh vào file excel mà ta đã download về được từ mảng jobs
84
+ jobs.each do |curr_image|
85
+ File.open("./images.xls","a+") do |file|
86
+ file.write(curr_image[:name].concat(",").concat(curr_image[:link]).concat(",").concat(curr_image[:size]).concat(",").concat(curr_image[:extension]))
87
+ file.write("\n")
88
+ end
89
+ end
90
+
91
+ end
92
+ end
93
+ end
@@ -0,0 +1,14 @@
1
+ require 'thor'
2
+ require_relative '../tomosia_123rf_crawl'
3
+
4
+ module Tomosia123rfCrawl
5
+ class Cli < Thor
6
+
7
+ desc "crawl KEYWORD", "enter KEYWORD to search"
8
+ option :destination
9
+ option :max
10
+ def crawl(keyword) # dung roi nay
11
+ Tomosia123rfCrawl::Crawler.new.run(keyword, options[:destination], options[:max].to_i)
12
+ end
13
+ end
14
+ end
@@ -1,3 +1,3 @@
1
1
  module Tomosia123rfCrawl
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.9"
3
3
  end
@@ -3,26 +3,29 @@ require_relative 'lib/tomosia_123rf_crawl/version'
3
3
  Gem::Specification.new do |spec|
4
4
  spec.name = "tomosia_123rf_crawl"
5
5
  spec.version = Tomosia123rfCrawl::VERSION
6
- spec.authors = "MinhTuong"
7
- spec.email = "totkitano@gmail.com"
8
- spec.summary = %q{gem complete date 7/8/2020}
9
- spec.description = %q{gem crawl data to web.}
10
- spec.homepage = "https://github.com/MinhTuongbk1210/Gem_tomosia_123rf_crawl"
11
- spec.license = "MIT"
6
+ spec.authors = ["Minh Tuong"]
7
+ spec.email = ["tt.tuong.tran@tomosia.com"]
8
+
9
+ spec.summary = %q{gem complete 7/8/2020.This is gem create}
10
+ spec.description = %q{This is gem cool gem.}
11
+ spec.homepage = "https://github.com/MinhTuongbk1210/gem_crawl_data"
12
12
  spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
13
13
 
14
14
  # spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"
15
15
 
16
- # spec.metadata["homepage_uri"] = spec.homepage
16
+ spec.metadata["homepage_uri"] = spec.homepage
17
17
  # spec.metadata["source_code_uri"] = "TODO: Put your gem's public repo URL here."
18
18
  # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
19
19
 
20
20
  # Specify which files should be added to the gem when it is released.
21
21
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
22
- spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
23
- `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
24
- end
22
+ spec.files = `git ls-files`.split("\n")
25
23
  spec.bindir = "exe"
26
- spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
24
+ spec.executables = ["tomosia_123rf_crawl"]
27
25
  spec.require_paths = ["lib"]
26
+ #spec.add_dependency 'thor'
27
+ spec.add_runtime_dependency('httparty', '0.18.1')
28
+ spec.add_runtime_dependency('nokogiri', '1.10.10')
29
+ spec.add_runtime_dependency('spreadsheet', '1.2.6')
30
+ spec.add_runtime_dependency('thor')
28
31
  end
metadata CHANGED
@@ -1,35 +1,96 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tomosia_123rf_crawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.9
5
5
  platform: ruby
6
6
  authors:
7
- - MinhTuong
7
+ - Minh Tuong
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-08-07 00:00:00.000000000 Z
12
- dependencies: []
13
- description: gem crawl data to web.
14
- email: totkitano@gmail.com
15
- executables: []
11
+ date: 2020-08-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: httparty
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '='
18
+ - !ruby/object:Gem::Version
19
+ version: 0.18.1
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '='
25
+ - !ruby/object:Gem::Version
26
+ version: 0.18.1
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '='
32
+ - !ruby/object:Gem::Version
33
+ version: 1.10.10
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '='
39
+ - !ruby/object:Gem::Version
40
+ version: 1.10.10
41
+ - !ruby/object:Gem::Dependency
42
+ name: spreadsheet
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '='
46
+ - !ruby/object:Gem::Version
47
+ version: 1.2.6
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '='
53
+ - !ruby/object:Gem::Version
54
+ version: 1.2.6
55
+ - !ruby/object:Gem::Dependency
56
+ name: thor
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: This is gem cool gem.
70
+ email:
71
+ - tt.tuong.tran@tomosia.com
72
+ executables:
73
+ - tomosia_123rf_crawl
16
74
  extensions: []
17
75
  extra_rdoc_files: []
18
76
  files:
19
77
  - ".gitignore"
20
78
  - CODE_OF_CONDUCT.md
21
79
  - Gemfile
80
+ - Gemfile.lock
22
81
  - README.md
23
82
  - Rakefile
24
83
  - bin/console
25
84
  - bin/setup
85
+ - exe/tomosia_123rf_crawl
26
86
  - lib/tomosia_123rf_crawl.rb
87
+ - lib/tomosia_123rf_crawl/cli.rb
27
88
  - lib/tomosia_123rf_crawl/version.rb
28
89
  - tomosia_123rf_crawl.gemspec
29
- homepage: https://github.com/MinhTuongbk1210/Gem_tomosia_123rf_crawl
30
- licenses:
31
- - MIT
32
- metadata: {}
90
+ homepage: https://github.com/MinhTuongbk1210/gem_crawl_data
91
+ licenses: []
92
+ metadata:
93
+ homepage_uri: https://github.com/MinhTuongbk1210/gem_crawl_data
33
94
  post_install_message:
34
95
  rdoc_options: []
35
96
  require_paths:
@@ -48,5 +109,5 @@ requirements: []
48
109
  rubygems_version: 3.1.4
49
110
  signing_key:
50
111
  specification_version: 4
51
- summary: gem complete date 7/8/2020
112
+ summary: gem complete 7/8/2020.This is gem create
52
113
  test_files: []