tomosia_amanaplus_crawl 0.1.8 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 793268d3d7e8d3c1f3fd17ca9f1283839dda2a20dfb1a8769c65fd6d25bf0236
4
- data.tar.gz: 115062f473eb2ca55c9ceea17265b73827b53feb548a48861b808b9baa64447f
3
+ metadata.gz: 78adb558a362c9594df8a864dad75bf4bc95472eb64e2aa9ef1499d2a39f6837
4
+ data.tar.gz: d2d670dbc023aa1f1ce265f78245b18a2177da373b0e4436a8a88ced97f9677b
5
5
  SHA512:
6
- metadata.gz: 83ee2ae24471817f907373dd2130fd54726786eb2fca7732e9fbe10ec841b4b5cbfc3fa4925753d725eb23362c330cec055a8b43eb8f0321053e7ce497f77930
7
- data.tar.gz: 5b92ea21908fa8288a2fa26af61c8c198b525f0de6f6aba7766e89931f064edd0bd37059d2fc205b8090bf0aac2825865fa5fe9ce8226dd7003efb74f55ecf2b
6
+ metadata.gz: 382535d1072a6803ffd0166ee70b99187514b9119656e94c852519ef3c608ed6d829a86d19d4d8a20027a6fc1e1a3fe909defc4644e139002998751a40bb124c
7
+ data.tar.gz: d18803def75f4efa16e4e4339e3f0b1e64a6a54f77c4a8a6b45d5cb8991e6a3e5a87a0c4d1047c22ad8fdab4266c1c3dedac07660663aeb6557624266e6c6809
@@ -1,23 +1,16 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tomosia_amanaplus_crawl (0.1.5)
5
- httparty (= 0.18.1)
4
+ tomosia_amanaplus_crawl (0.2.3)
6
5
  nokogiri (= 1.10.10)
7
6
  spreadsheet (= 1.2.6)
7
+ thor
8
8
 
9
9
  GEM
10
10
  remote: https://rubygems.org/
11
11
  specs:
12
12
  diff-lcs (1.4.4)
13
- httparty (0.18.1)
14
- mime-types (~> 3.0)
15
- multi_xml (>= 0.5.2)
16
- mime-types (3.3.1)
17
- mime-types-data (~> 3.2015)
18
- mime-types-data (3.2020.0512)
19
13
  mini_portile2 (2.4.0)
20
- multi_xml (0.6.0)
21
14
  nokogiri (1.10.10)
22
15
  mini_portile2 (~> 2.4.0)
23
16
  rake (12.3.3)
@@ -37,6 +30,7 @@ GEM
37
30
  ruby-ole (1.2.12.2)
38
31
  spreadsheet (1.2.6)
39
32
  ruby-ole (>= 1.0)
33
+ thor (1.0.1)
40
34
 
41
35
  PLATFORMS
42
36
  ruby
data/README.md CHANGED
@@ -23,9 +23,9 @@ Or install it yourself as:
23
23
  ## Usage
24
24
 
25
25
  ```ruby
26
- require 'tomosia_amanaplus_crawl'
27
- TomosiaAmanaplusCrawl::Crawler.new.run(keyword, path, max)
26
+ tomosia_amanaplus_crawl crawl "keyword" --destination "/home/usr/Documents" --max=123
28
27
  ```
28
+ Example: tomosia_amanaplus_crawl crawl "hoian" --destination "./" --max=123
29
29
  keyword: hoian, danang, ...
30
30
  path: './', '/desktop/', ...
31
31
  max: số lượng ảnh muốn lấy về. Nếu max lớn hơn tổng số ảnh các page thì vẫn lấy hết tất cả ảnh
data/Rakefile CHANGED
@@ -4,3 +4,9 @@ require "rspec/core/rake_task"
4
4
  RSpec::Core::RakeTask.new(:spec)
5
5
 
6
6
  task :default => :spec
7
+
8
+ namespace :gem do
9
+ task :build do
10
+ system "rake build && rake install"
11
+ end
12
+ end
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'tomosia_amanaplus_crawl/cli'
4
+
5
+ TomosiaAmanaplusCrawl::Cli.start
@@ -1,30 +1,25 @@
1
1
  require "tomosia_amanaplus_crawl/version"
2
2
 
3
3
  module TomosiaAmanaplusCrawl
4
- def self.yeuNgucLep
5
- puts "Yeu chi My nhieu lam"
6
- end
7
-
8
4
  require 'nokogiri'
9
- require 'httparty'
10
5
  require 'open-uri'
11
6
  require 'fileutils'
12
7
  require 'spreadsheet'
13
8
 
14
9
  class Crawler
15
- URL = "https://plus.amanaimages.com/items/search/"
10
+ URL = "https://plus.amanaimages.com/items/search"
16
11
 
17
12
  def run(keyword, destination, max)
18
- unparsed_page = HTTParty.get("#{URL}/#{keyword}")
13
+ unparsed_page = open("#{URL}/#{keyword}").read
19
14
  parsed_page = Nokogiri::HTML(unparsed_page)
20
15
 
21
16
  pages = parsed_page.css("div.c-paginate__nums").css('a').last.text.to_i # tổng số page
22
17
  images_listings = parsed_page.css("div.p-search-result__body") # danh sách các thẻ div chứa image
23
-
18
+
24
19
  # lấy tổng số image
25
20
  total = parsed_page.css("h1.p-search-result__ttl").text.split(' ').first
26
- total = total[11..(total.length - 1)].chop.chop.chop.sub(',', '').to_i
27
- if max > total # nếu max lớn hơn total thì max = total => vẫn lấy hết
21
+ total = total[(6 + keyword.length)..(total.length - 1)].chop.chop.chop.gsub(',', '').to_i
22
+ if max == nil || max > total # nếu max lớn hơn total thì max = total => vẫn lấy hết
28
23
  max = total
29
24
  end
30
25
 
@@ -39,11 +34,11 @@ module TomosiaAmanaplusCrawl
39
34
  curr_index = 1
40
35
  while curr_page <= pages
41
36
  puts "Crawling page #{curr_page}..........."
42
-
43
- pagination_unparsed_page = HTTParty.get("https://plus.amanaimages.com/items/search/#{keyword}?page=#{curr_page}")
37
+
38
+ pagination_unparsed_page = open("#{URL}/#{keyword}?page=#{curr_page}").read
44
39
  pagination_parsed_page = Nokogiri::HTML(pagination_unparsed_page)
45
40
  pagination_images_listings = pagination_parsed_page.css("div.p-item-thumb")
46
-
41
+
47
42
  pagination_images_listings.each do |img|
48
43
  if curr_index > max
49
44
  return images
@@ -59,7 +54,7 @@ module TomosiaAmanaplusCrawl
59
54
  images << current_image
60
55
  curr_index += 1
61
56
  end
62
-
57
+
63
58
  curr_page += 1
64
59
  end
65
60
  images
@@ -67,20 +62,30 @@ module TomosiaAmanaplusCrawl
67
62
 
68
63
  # tải hình và cập nhật lại size
69
64
  def downloadImages(images, destination)
70
- path = "#{destination}/Downloads" # lưu hình ở folder Downloads
65
+ path = "#{destination}//Downloads" # lưu hình ở folder Downloads
71
66
  Dir.mkdir path unless File.exist? path
72
67
 
73
68
  threads = []
74
69
  print "\nDownloading"
75
70
  images.each do |curr_image|
76
71
  threads << Thread.new(curr_image) {
77
- open(curr_image[:url]) do |image|
78
- File.open("#{path}/#{curr_image[:url].split('/').last}", "a+") do |file|
79
- file.write(image.read) # lưu hình ảnh
80
- curr_image[:size] = image.size # cập nhật lại size trong mảng images
81
- print "."
72
+ timeout = 0
73
+ begin
74
+ URI.open(curr_image[:url]) do |image|
75
+ File.open("#{path}/#{curr_image[:url].split('/').last}", "a+") do |file|
76
+ file.write(image.read) # lưu hình ảnh
77
+ curr_image[:size] = image.size # cập nhật lại size trong mảng images
78
+ print '.'
79
+ end
80
+ end # end open
81
+ rescue => exception
82
+ if timeout < 3
83
+ timeout += 1
84
+ retry
85
+ else
86
+ next
82
87
  end
83
- end # end open
88
+ end
84
89
  }
85
90
  end
86
91
  threads.each { |t| t.join }
@@ -88,16 +93,15 @@ module TomosiaAmanaplusCrawl
88
93
  end
89
94
 
90
95
  def writeToExcel(images, destination)
91
- path = "#{destination}/File Excel" # lưu file ở folder File Excel
96
+ path = "#{destination}//File Excel" # lưu file ở folder File Excel
92
97
  Dir.mkdir path unless File.exist? path
93
98
 
94
99
  book = Spreadsheet::Workbook.new
95
100
  sheet1 = book.create_worksheet
96
101
 
97
- i = 0
98
102
  sheet1.row(0).concat %w{Title Url Size(bytes) Extension}
99
103
  puts "Writing..........."
100
- images.each do |img|
104
+ images.each_with_index do |img, i|
101
105
  sheet1.row(i += 1).push img[:title], img[:url], img[:size], img[:extension]
102
106
  end
103
107
  puts "Writed."
@@ -0,0 +1,14 @@
1
+ require 'thor'
2
+ require_relative '../tomosia_amanaplus_crawl'
3
+
4
+ module TomosiaAmanaplusCrawl
5
+ class Cli < Thor
6
+
7
+ desc "crawl KEYWORD", "enter KEYWORD to search"
8
+ option :destination
9
+ option :max
10
+ def crawl(keyword)
11
+ TomosiaAmanaplusCrawl::Crawler.new.run(keyword, options[:destination], options[:max] == nil ? nil : options[:max].to_i)
12
+ end
13
+ end
14
+ end
@@ -1,3 +1,3 @@
1
1
  module TomosiaAmanaplusCrawl
2
- VERSION = "0.1.8"
2
+ VERSION = "0.2.4"
3
3
  end
@@ -0,0 +1,14 @@
1
+ require "bundler/setup"
2
+ require "tomosia_amanaplus_crawl"
3
+
4
+ RSpec.configure do |config|
5
+ # Enable flags like --only-failures and --next-failure
6
+ config.example_status_persistence_file_path = ".rspec_status"
7
+
8
+ # Disable RSpec exposing methods globally on `Module` and `main`
9
+ config.disable_monkey_patching!
10
+
11
+ config.expect_with :rspec do |c|
12
+ c.syntax = :expect
13
+ end
14
+ end
@@ -0,0 +1,9 @@
1
+ RSpec.describe TomosiaAmanaplusCrawl do
2
+ it "has a version number" do
3
+ expect(TomosiaAmanaplusCrawl::VERSION).not_to be nil
4
+ end
5
+
6
+ it "does something useful" do
7
+ expect(false).to eq(true)
8
+ end
9
+ end
@@ -9,13 +9,12 @@ Gem::Specification.new do |spec|
9
9
  spec.homepage = "https://github.com/tthuydang/tomosia_amanaplus_crawl"
10
10
  spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
11
11
 
12
- spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
13
- `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
14
- end
15
- spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
12
+ spec.files = `git ls-files`.split("\n")
13
+ spec.bindir = "exe"
14
+ spec.executables = 'tomosia_amanaplus_crawl'
16
15
  spec.require_paths = ["lib"]
17
16
 
18
- spec.add_runtime_dependency('httparty', '0.18.1')
19
17
  spec.add_runtime_dependency('nokogiri', '1.10.10')
20
18
  spec.add_runtime_dependency('spreadsheet', '1.2.6')
19
+ spec.add_runtime_dependency('thor')
21
20
  end
metadata CHANGED
@@ -1,60 +1,61 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tomosia_amanaplus_crawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.8
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nhat Huy
8
8
  autorequire:
9
- bindir: bin
9
+ bindir: exe
10
10
  cert_chain: []
11
- date: 2020-08-07 00:00:00.000000000 Z
11
+ date: 2020-08-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: httparty
14
+ name: nokogiri
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - '='
18
18
  - !ruby/object:Gem::Version
19
- version: 0.18.1
19
+ version: 1.10.10
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - '='
25
25
  - !ruby/object:Gem::Version
26
- version: 0.18.1
26
+ version: 1.10.10
27
27
  - !ruby/object:Gem::Dependency
28
- name: nokogiri
28
+ name: spreadsheet
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - '='
32
32
  - !ruby/object:Gem::Version
33
- version: 1.10.10
33
+ version: 1.2.6
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - '='
39
39
  - !ruby/object:Gem::Version
40
- version: 1.10.10
40
+ version: 1.2.6
41
41
  - !ruby/object:Gem::Dependency
42
- name: spreadsheet
42
+ name: thor
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - '='
45
+ - - ">="
46
46
  - !ruby/object:Gem::Version
47
- version: 1.2.6
47
+ version: '0'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - '='
52
+ - - ">="
53
53
  - !ruby/object:Gem::Version
54
- version: 1.2.6
54
+ version: '0'
55
55
  description:
56
56
  email:
57
- executables: []
57
+ executables:
58
+ - tomosia_amanaplus_crawl
58
59
  extensions: []
59
60
  extra_rdoc_files: []
60
61
  files:
@@ -69,8 +70,12 @@ files:
69
70
  - Rakefile
70
71
  - bin/console
71
72
  - bin/setup
73
+ - exe/tomosia_amanaplus_crawl
72
74
  - lib/tomosia_amanaplus_crawl.rb
75
+ - lib/tomosia_amanaplus_crawl/cli.rb
73
76
  - lib/tomosia_amanaplus_crawl/version.rb
77
+ - spec/spec_helper.rb
78
+ - spec/tomosia_amanaplus_crawl_spec.rb
74
79
  - tomosia_amanaplus_crawl.gemspec
75
80
  homepage: https://github.com/tthuydang/tomosia_amanaplus_crawl
76
81
  licenses: []