tomosia_123rf_crawl 0.1.0 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1f78b6e2908eae0a4cbdb18a6004a0cdd1bf85665aeb8c3a56efc9fb66cd0434
4
- data.tar.gz: 7e2b5142e956f9978f9fabac6d32824378003f2a45e5131554560831c2f2ad3f
3
+ metadata.gz: d01cc2c7974a1e7504f55c1343e24e145eff9e953f2a58103db081ee41b4e4a3
4
+ data.tar.gz: 537d07742bb26609dff324b968eec2983a391ba25e318a84410392e34e76f02f
5
5
  SHA512:
6
- metadata.gz: f55f661cded1ff753eb0d13069266d9c81e9f3f5d8dc0ace6d9a9d5aacf7e287e3e81d3871de9fe4ca978ee1893c70d56f088eb80dc0fdcae012b78f0a3bffee
7
- data.tar.gz: 461112743f48a7e37e25e16beb4bef9c6251b5c442cd283e829fb543f9c9f9eac82bec80fb23244df25b7741f5ee8902905c3e02862ae32146c58ee71c11614f
6
+ metadata.gz: d49573e81fd948f6bb2464acb9e5b214d9ccab3e0e1b221ee8ea7767b4bc95d692f84b3ddd22259e1c33b6244faf4febd0f3d8fe30398e7e287ce2f91e364c73
7
+ data.tar.gz: 87f9360b304fa785a0fbcda0d42a78f73aa9cfc41e337491f8c0b54875d6c8ed354235ca147c460fe7a8acce402d8252d038f54bd9d5b48d21432448e72a8e48
data/Gemfile CHANGED
@@ -4,6 +4,3 @@ source "https://rubygems.org"
4
4
  gemspec
5
5
 
6
6
  gem "rake", "~> 12.0"
7
- gem 'httparty'
8
- gem 'nokogiri'
9
-
@@ -0,0 +1,37 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ tomosia_123rf_crawl (0.1.9)
5
+ httparty (= 0.18.1)
6
+ nokogiri (= 1.10.10)
7
+ spreadsheet (= 1.2.6)
8
+ thor
9
+
10
+ GEM
11
+ remote: https://rubygems.org/
12
+ specs:
13
+ httparty (0.18.1)
14
+ mime-types (~> 3.0)
15
+ multi_xml (>= 0.5.2)
16
+ mime-types (3.3.1)
17
+ mime-types-data (~> 3.2015)
18
+ mime-types-data (3.2020.0512)
19
+ mini_portile2 (2.4.0)
20
+ multi_xml (0.6.0)
21
+ nokogiri (1.10.10)
22
+ mini_portile2 (~> 2.4.0)
23
+ rake (12.3.3)
24
+ ruby-ole (1.2.12.2)
25
+ spreadsheet (1.2.6)
26
+ ruby-ole (>= 1.0)
27
+ thor (1.0.1)
28
+
29
+ PLATFORMS
30
+ ruby
31
+
32
+ DEPENDENCIES
33
+ rake (~> 12.0)
34
+ tomosia_123rf_crawl!
35
+
36
+ BUNDLED WITH
37
+ 2.1.4
data/Rakefile CHANGED
@@ -1,2 +1,12 @@
1
1
  require "bundler/gem_tasks"
2
+ # require "rspec/core/rake task"
3
+ # Rspec::Core::RakeTask.new(:spec)
4
+
2
5
  task :default => :spec
6
+ # gem:build shells out to run "rake build" and, if that succeeds, "rake install".
7
+ namespace :gem do
8
+ task :build do
9
+ system "rake build && rake install"
10
+ end
11
+ end
12
+
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+ require 'tomosia_123rf_crawl/cli'
3
+ Tomosia123rfCrawl::Cli.start
@@ -1,11 +1,93 @@
1
1
  require "tomosia_123rf_crawl/version"
2
- require "tomosia_123rf_crawl/tomosia_123rf_crawl"
3
2
 
4
- module ActionView
5
- module Helpers
6
- module AssetTagHelper
7
- include Tomosia123rfCrawl
3
+ module Tomosia123rfCrawl
4
+ require 'nokogiri'
5
+ require 'open-uri'
6
+ require 'fileutils'
7
+ require 'spreadsheet'
8
+
9
+ class Crawler
10
+
11
+ # Downloads every entry of +images+ (hashes carrying a :link URL) into "<destination>/images", one thread per image, and stores the downloaded size as a string in each hash's :size.
12
+ def downloadImages(images, destination)
13
+ # Folder used to store the downloaded images
14
+ path= "#{destination}/images"
15
+ # NOTE(review): open() on a URL below relies on open-uri patching Kernel#open; Ruby 3.0 removed that in favour of URI.open - confirm the targeted Ruby version.
16
+ # Create that folder if it does not exist yet (Dir.mkdir is not recursive, so <destination> itself must already exist)
17
+ Dir.mkdir path unless File.exist? path
18
+ # One download thread per image; all threads are joined before the method returns
19
+ threads = []
20
+ images.each do |curr_image|
21
+ threads << Thread.new(curr_image){
22
+ open(curr_image[:link]) do |image|
23
+ File.open("#{path}/".concat(curr_image[:link].split('/').last.to_s.split('?').first.to_s),"a+") do |file|
24
+ file.write(image.read)
25
+ curr_image[:size] = image.size.to_s
26
+ end
27
+ end
28
+ # NOTE(review): the target file is opened in "a+" (append) mode, so downloading the same link again appends to the existing file instead of replacing it.
29
+ }
30
+ end
31
+ threads.each{|t| t.join}
8
32
  end
9
- end
10
- end
11
33
 
34
+ def run(keyword, destination, max)
35
+ # Crawls 123rf stock-photo result pages for +keyword+, hands the collected image entries to downloadImages together with +destination+, and writes one NAME,URL,SIZE,EXTENSION row per image to images.xls in the current directory. +max+ bounds the paging loop below.
36
+ url = "https://www.123rf.com/stock-photo/dog.html?start=1&sti=mwk3081ovjk7062i9c|"
37
+ unparsed_page = open("#{url}/#{keyword}").read
38
+ parsed_page = Nokogiri::HTML(unparsed_page)
39
+ jobs = Array.new
40
+ # NOTE(review): the request above targets the hard-coded "dog" URL with "/<keyword>" appended, and list_imgs below is never read again in this method.
41
+ list_imgs = parsed_page.css('div.mosaic-main-container')
42
+ # (author's note) list_imgs.count = 110 images
43
+ # Crawl result pages up to the maximum number of pages requested
44
+ # count_page is the value sent as the "start" query parameter; it grows by 110 per iteration
45
+ count_page = 1
46
+ # (Re)create images.xls and write the header row; the content is comma-separated text
47
+ File.delete("images.xls") if File.file?("images.xls")
48
+ File.new("images.xls","a+")
49
+ File.open("images.xls","a") do |file|
50
+ file.write("NAME,URL,SIZE,EXTENSION")
51
+ file.write("\n")
52
+ end
53
+
54
+
55
+ # With start offsets 1, 111, 221, ... this loop fetches +max+ pages
56
+ while count_page <= max*110
57
+ pagination_url ="https://www.123rf.com/stock-photo/#{keyword}.html?start=#{count_page}&sti=mwk3081ovjk7062i9c|"
58
+ pagination_unparsed_page = open(pagination_url).read
59
+ pagination_parsed_page = Nokogiri::HTML( pagination_unparsed_page )
60
+ pagination_list_imgs = pagination_parsed_page.css('div.mosaic-main-container')
61
+ puts "Page :#{count_page}"
62
+ puts pagination_url
63
+ puts " ..............."
64
+
65
+
66
+ # Collect name, link, a size placeholder and the extension of every image on this page into jobs
67
+ pagination_list_imgs.each do |img_list|
68
+ job={
69
+ name: img_list.css('img').attr('src').text.split('/').last.to_s,
70
+ link: img_list.css('img').attr('src').text,
71
+ size: 'nil',
72
+ extension: img_list.css('img').attr('src').text.split('.').last.to_s.split('?').first.to_s
73
+
74
+ }
75
+ jobs << job
76
+ end
77
+ downloadImages(jobs, destination)
78
+ # NOTE(review): jobs accumulates across pages, so each call above also passes the entries of all earlier pages again.
79
+
80
+ count_page+=110
81
+ end
82
+
83
+ # Append one comma-separated row per collected image to images.xls (String#concat mutates curr_image[:name] in place)
84
+ jobs.each do |curr_image|
85
+ File.open("./images.xls","a+") do |file|
86
+ file.write(curr_image[:name].concat(",").concat(curr_image[:link]).concat(",").concat(curr_image[:size]).concat(",").concat(curr_image[:extension]))
87
+ file.write("\n")
88
+ end
89
+ end
90
+
91
+ end
92
+ end
93
+ end
@@ -0,0 +1,14 @@
1
+ require 'thor'
2
+ require_relative '../tomosia_123rf_crawl'
3
+
4
+ module Tomosia123rfCrawl
5
+ class Cli < Thor
6
+ # Thor command-line interface: "crawl KEYWORD" forwards the keyword plus the --destination and --max options to Crawler#run.
7
+ desc "crawl KEYWORD", "enter KEYWORD to search"
8
+ option :destination
9
+ option :max
10
+ def crawl(keyword) # options[:max] is converted with to_i, so a missing --max becomes 0
11
+ Tomosia123rfCrawl::Crawler.new.run(keyword, options[:destination], options[:max].to_i)
12
+ end
13
+ end
14
+ end
@@ -1,3 +1,3 @@
1
1
  module Tomosia123rfCrawl
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.9"
3
3
  end
@@ -3,26 +3,29 @@ require_relative 'lib/tomosia_123rf_crawl/version'
3
3
  Gem::Specification.new do |spec|
4
4
  spec.name = "tomosia_123rf_crawl"
5
5
  spec.version = Tomosia123rfCrawl::VERSION
6
- spec.authors = "MinhTuong"
7
- spec.email = "totkitano@gmail.com"
8
- spec.summary = %q{gem complete date 7/8/2020}
9
- spec.description = %q{gem crawl data to web.}
10
- spec.homepage = "https://github.com/MinhTuongbk1210/Gem_tomosia_123rf_crawl"
11
- spec.license = "MIT"
6
+ spec.authors = ["Minh Tuong"]
7
+ spec.email = ["tt.tuong.tran@tomosia.com"]
8
+
9
+ spec.summary = %q{gem complete 7/8/2020.This is gem create}
10
+ spec.description = %q{This is gem cool gem.}
11
+ spec.homepage = "https://github.com/MinhTuongbk1210/gem_crawl_data"
12
12
  spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
13
13
 
14
14
  # spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"
15
15
 
16
- # spec.metadata["homepage_uri"] = spec.homepage
16
+ spec.metadata["homepage_uri"] = spec.homepage
17
17
  # spec.metadata["source_code_uri"] = "TODO: Put your gem's public repo URL here."
18
18
  # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
19
19
 
20
20
  # Specify which files should be added to the gem when it is released.
21
21
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
22
- spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
23
- `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
24
- end
22
+ spec.files = `git ls-files`.split("\n")
25
23
  spec.bindir = "exe"
26
- spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
24
+ spec.executables = ["tomosia_123rf_crawl"]
27
25
  spec.require_paths = ["lib"]
26
+ #spec.add_dependency 'thor'
27
+ spec.add_runtime_dependency('httparty', '0.18.1')
28
+ spec.add_runtime_dependency('nokogiri', '1.10.10')
29
+ spec.add_runtime_dependency('spreadsheet', '1.2.6')
30
+ spec.add_runtime_dependency('thor')
28
31
  end
metadata CHANGED
@@ -1,35 +1,96 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tomosia_123rf_crawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.9
5
5
  platform: ruby
6
6
  authors:
7
- - MinhTuong
7
+ - Minh Tuong
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-08-07 00:00:00.000000000 Z
12
- dependencies: []
13
- description: gem crawl data to web.
14
- email: totkitano@gmail.com
15
- executables: []
11
+ date: 2020-08-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: httparty
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '='
18
+ - !ruby/object:Gem::Version
19
+ version: 0.18.1
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '='
25
+ - !ruby/object:Gem::Version
26
+ version: 0.18.1
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '='
32
+ - !ruby/object:Gem::Version
33
+ version: 1.10.10
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '='
39
+ - !ruby/object:Gem::Version
40
+ version: 1.10.10
41
+ - !ruby/object:Gem::Dependency
42
+ name: spreadsheet
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '='
46
+ - !ruby/object:Gem::Version
47
+ version: 1.2.6
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '='
53
+ - !ruby/object:Gem::Version
54
+ version: 1.2.6
55
+ - !ruby/object:Gem::Dependency
56
+ name: thor
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: This is gem cool gem.
70
+ email:
71
+ - tt.tuong.tran@tomosia.com
72
+ executables:
73
+ - tomosia_123rf_crawl
16
74
  extensions: []
17
75
  extra_rdoc_files: []
18
76
  files:
19
77
  - ".gitignore"
20
78
  - CODE_OF_CONDUCT.md
21
79
  - Gemfile
80
+ - Gemfile.lock
22
81
  - README.md
23
82
  - Rakefile
24
83
  - bin/console
25
84
  - bin/setup
85
+ - exe/tomosia_123rf_crawl
26
86
  - lib/tomosia_123rf_crawl.rb
87
+ - lib/tomosia_123rf_crawl/cli.rb
27
88
  - lib/tomosia_123rf_crawl/version.rb
28
89
  - tomosia_123rf_crawl.gemspec
29
- homepage: https://github.com/MinhTuongbk1210/Gem_tomosia_123rf_crawl
30
- licenses:
31
- - MIT
32
- metadata: {}
90
+ homepage: https://github.com/MinhTuongbk1210/gem_crawl_data
91
+ licenses: []
92
+ metadata:
93
+ homepage_uri: https://github.com/MinhTuongbk1210/gem_crawl_data
33
94
  post_install_message:
34
95
  rdoc_options: []
35
96
  require_paths:
@@ -48,5 +109,5 @@ requirements: []
48
109
  rubygems_version: 3.1.4
49
110
  signing_key:
50
111
  specification_version: 4
51
- summary: gem complete date 7/8/2020
112
+ summary: gem complete 7/8/2020.This is gem create
52
113
  test_files: []