google_image_scraper 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 896f50afed6a74a3456062ed7110ffa03c51f53cba692e9662e6e5e0586a7644
4
+ data.tar.gz: ce7ae263ab5eb217d354f92fad2f61702b94fde058d4c914498edd402808782b
5
+ SHA512:
6
+ metadata.gz: 2b07827bb2a238ac55dc43578371c4c16e52755c5f46230c92fd6ee5466412f45e82b62c628b00a71d1d992759017a7128873ea1cdb43cb115831fde907a4b84
7
+ data.tar.gz: 54b982e88b162bc77cf0439943cbd73093968c8c5e753bf4c45987eac9731e91ec7b787961e2400c8cac8981cbd99b6d5968dabf09c713557c892936adb23893
@@ -0,0 +1,13 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+ /20*
10
+ /Gemfile.lock
11
+
12
+ # rspec failure tracking
13
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
@@ -0,0 +1,10 @@
1
+ Style/StringLiterals:
2
+ Enabled: true
3
+ EnforcedStyle: double_quotes
4
+
5
+ Style/StringLiteralsInInterpolation:
6
+ Enabled: true
7
+ EnforcedStyle: double_quotes
8
+
9
+ Layout/LineLength:
10
+ Max: 120
data/Gemfile ADDED
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ source 'https://rubygems.org'
4
+
5
+ # Specify your gem's dependencies in google_image_scraper.gemspec
6
+ gemspec
7
+
8
+ gem 'rake', '~> 13.0'
9
+
10
+ gem 'rspec', '~> 3.0'
11
+
12
+ gem 'rubocop', '~> 0.80'
13
+
14
+ gem 'byebug'
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2021 Yudai Tanaka
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,38 @@
1
+ # GoogleImageScraper
2
+
3
+ A command line tool for downloading images from Google Image Search.
4
+
5
+ ## Installation
6
+ ```
7
+ $ gem install google_image_scraper
8
+ ```
9
+
10
+ ## Usage
11
+ Execute `google_image_scraper` with `search_word`.
12
+ If you want to limit the amount of downloaded images, specify an integer as `max_number`.
13
+
14
+ ```
15
+ $ google_image_scraper search_word [max_number]
16
+ ```
17
+
18
+ After the command finishes, the downloaded images are saved in a directory named after the current date and time in YYYYMMDDHHMM format (e.g. 202001011023).
19
+
20
+ e.g.
21
+ ```
22
+ $ google_image_scraper cat
23
+ $ google_image_scraper cat 10
24
+ ```
25
+
26
+ ## Development
27
+
28
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
29
+
30
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
31
+
32
+ ## Contributing
33
+
34
+ Bug reports and pull requests are welcome on GitHub at https://github.com/ytnk531/google_image_scraper.
35
+
36
+ ## License
37
+
38
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ require "rubocop/rake_task"
9
+
10
+ RuboCop::RakeTask.new
11
+
12
+ task default: %i[spec rubocop]
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "bundler/setup"
5
+ require "google_image_scraper"
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ # (If you use this, don't forget to add pry to your Gemfile!)
11
+ # require "pry"
12
+ # Pry.start
13
+
14
+ require "irb"
15
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'google_image_scraper'
4
+
5
# Command line entry point.
# Usage: google_image_scraper search_word [max_number]
keyword = ARGV[0]
abort 'Usage: google_image_scraper search_word [max_number]' if keyword.nil? || keyword.strip.empty?

# max_number is optional. Reject anything that is not a positive integer
# explicitly: String#to_i would silently turn non-numeric input into 0.
limit = nil
if ARGV[1]
  limit = Integer(ARGV[1], 10, exception: false)
  abort "max_number must be a positive integer: #{ARGV[1]}" unless limit&.positive?
end

GoogleImageScraper::Scraper.new.scrape(keyword, limit)
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/google_image_scraper/version'
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = 'google_image_scraper'
7
+ spec.version = GoogleImageScraper::VERSION
8
+ spec.authors = ['Yudai Tanaka']
9
+ spec.email = ['ytnk531@gmail.com']
10
+
11
+ spec.summary = 'An image downloader using Google image search.'
12
+ spec.description = 'An image downloader using Google image search. This is unofficial gem.'
13
+ spec.homepage = 'https://github.com/ytnk531/google_image_scraper'
14
+ spec.license = 'MIT'
15
+ spec.required_ruby_version = Gem::Requirement.new('>= 2.7.0')
16
+
17
+ # spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
18
+ spec.metadata['homepage_uri'] = spec.homepage
19
+ spec.metadata['source_code_uri'] = 'https://github.com/ytnk531/google_image_scraper'
20
+ # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
21
+
22
+ # Specify which files should be added to the gem when it is released.
23
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
24
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
25
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
26
+ end
27
+ spec.bindir = 'exe'
28
+ spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
29
+ spec.require_paths = ['lib']
30
+
31
+ spec.add_dependency 'selenium-webdriver', '~> 3.0'
32
+ end
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'google_image_scraper/version'
4
+ require_relative 'google_image_scraper/file_saver'
5
+ require_relative 'google_image_scraper/scraper'
6
+ require 'logger'
7
+
8
+ # Google image scraper
9
+ module GoogleImageScraper
10
+ Logger = ::Logger.new($stdout)
11
+
12
+ class Error < StandardError; end
13
+ end
@@ -0,0 +1,43 @@
1
require 'base64'
require 'digest'
require 'open-uri'
3
+
4
# Decoding/downloading of image sources and saving them to disk.
module GoogleImageScraper
  # Writes images into a directory named after the time of the first save
  # (YYYYMMDDHHMM), created relative to the current working directory.
  class FileSaver
    # Saves one image.
    #
    # @param src [String] either a base64 `data:image/...` URI or a URL
    # @return [Integer] number of bytes written
    def save(src)
      binary, extname = src.start_with?('data:') ? to_file_from_base64(src) : to_file_from_url(src)
      # The file name is derived from the source so the same source always maps to the same file.
      fname = "#{Digest::MD5.hexdigest(src)[0...10]}.#{extname}"

      # Image data is binary; binwrite avoids any newline/encoding translation.
      File.binwrite(File.join(dir, fname), binary)
    end

    private

    # Path of the output directory. It is created on first use and memoized as a
    # plain String, so no directory handle is left open.
    def dir
      @dir ||= Time.now.strftime('%Y%m%d%H%M').tap do |dirname|
        Dir.mkdir(dirname) unless Dir.exist?(dirname)
      end
    end

    # Splits a data URI into its header and payload.
    #
    # @return [Array(String, String)] decoded bytes and the bare file extension
    def to_file_from_base64(src)
      data_prefix, base64 = src.split(',', 2)
      [Base64.decode64(base64.to_s), data_extname(data_prefix)]
    end

    # Downloads the image behind +url+.
    #
    # The URL is parsed first and opened through the URI object, so a string
    # that is not a URL can never fall through to Kernel#open. The block form
    # closes the stream once the body has been read.
    #
    # @return [Array(String, String)] response body and the content type's subtype
    def to_file_from_url(url)
      URI.parse(url).open do |io|
        [io.read, io.content_type.split('/').last]
      end
    end

    # Extracts the image subtype (e.g. "jpeg") from a header such as
    # "data:image/jpeg;base64". Returns nil when the header has another shape.
    def data_extname(data_prefix)
      %r{data:image/(?<extname>.+);base64} =~ data_prefix
      extname
    end
  end
end
@@ -0,0 +1,62 @@
1
+ require 'open-uri'
2
+ require 'selenium-webdriver'
3
+
4
module GoogleImageScraper
  # Drives Chrome through Selenium to open a Google image search page, click
  # each thumbnail and hand the `src` of the enlarged picture to a FileSaver.
  class Scraper
    LARGE_PICTURE_XPATH = '/html/body/div[2]/c-wiz/div[3]' \
                          '/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div[1]/div[1]/div/div[2]/a/img'.freeze
    SEARCH_PAGE_URL_FORMAT = 'https://www.google.com/search?q=%s&source=lnms&tbm=isch'.freeze
    THUMBNAIL_SELECTOR = '.mJxzWe img'.freeze

    # The browser itself is started per #scrape call so that it can always be
    # shut down again when that call finishes.
    def initialize
      @file_saver = FileSaver.new
      @wait = Selenium::WebDriver::Wait.new(timeout: 10)
    end

    # Downloads the images found for +keyword+.
    #
    # @param keyword [String] search term
    # @param limit [Integer, nil] maximum number of images; nil means no limit
    # @return [nil]
    # @raise [ArgumentError] if +limit+ is negative
    def scrape(keyword, limit = nil)
      raise ArgumentError, 'limit must not be negative' if limit&.negative?
      return if limit&.zero?

      @driver = setup_driver
      @driver.get search_page_url(keyword)
      start = 0

      loop do
        # From the second pass on, more elements are found than before because the
        # search page keeps loading results as the view moves down.
        elements = @driver.find_elements(css: THUMBNAIL_SELECTOR)
        break if start == elements.size

        batch = limit ? elements[start...limit] : elements[start..]
        download_images(batch || [])

        start = elements.size
        break if limit && start >= limit

        # Clicking the last thumbnail moves the view down so further results get loaded.
        elements.last.click
      end
    ensure
      # Always shut the browser down, also when scraping fails half-way.
      @driver&.quit
      @driver = nil
    end

    private

    def search_page_url(keyword)
      format(SEARCH_PAGE_URL_FORMAT, URI.encode_www_form_component(keyword))
    end

    def download_images(html_elements)
      html_elements.each do |element|
        # Show the image in the side panel, then wait until the large image is displayed.
        element.click

        sleep 0.4
        large_picture_element = @wait.until do
          @driver.find_element(:xpath, LARGE_PICTURE_XPATH)
        end
        @file_saver.save(large_picture_element.attribute('src'))
      end
    end

    # Starts a headless Chrome session.
    # NOTE(review): newer chromedriver versions may expect the 'goog:chromeOptions'
    # key instead of 'chromeOptions' - confirm against the pinned selenium version.
    def setup_driver
      caps = Selenium::WebDriver::Remote::Capabilities.chrome(
        'chromeOptions' => { args: %w[--headless --disable-gpu window-size=1280x8000] }
      )
      Selenium::WebDriver.for :chrome, desired_capabilities: caps
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module GoogleImageScraper
4
+ VERSION = "0.1.0"
5
+ end
metadata ADDED
@@ -0,0 +1,75 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: google_image_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Yudai Tanaka
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2021-01-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: selenium-webdriver
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '3.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '3.0'
27
+ description: An image downloader using Google image search. This is unofficial gem.
28
+ email:
29
+ - ytnk531@gmail.com
30
+ executables:
31
+ - google_image_scraper
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - ".gitignore"
36
+ - ".rspec"
37
+ - ".rubocop.yml"
38
+ - Gemfile
39
+ - LICENSE.txt
40
+ - README.md
41
+ - Rakefile
42
+ - bin/console
43
+ - bin/setup
44
+ - exe/google_image_scraper
45
+ - google_image_scraper.gemspec
46
+ - lib/google_image_scraper.rb
47
+ - lib/google_image_scraper/file_saver.rb
48
+ - lib/google_image_scraper/scraper.rb
49
+ - lib/google_image_scraper/version.rb
50
+ homepage: https://github.com/ytnk531/google_image_scraper
51
+ licenses:
52
+ - MIT
53
+ metadata:
54
+ homepage_uri: https://github.com/ytnk531/google_image_scraper
55
+ source_code_uri: https://github.com/ytnk531/google_image_scraper
56
+ post_install_message:
57
+ rdoc_options: []
58
+ require_paths:
59
+ - lib
60
+ required_ruby_version: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: 2.7.0
65
+ required_rubygems_version: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ requirements: []
71
+ rubygems_version: 3.2.3
72
+ signing_key:
73
+ specification_version: 4
74
+ summary: An image downloader using Google image search.
75
+ test_files: []