instagram-crawler 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 27b1cbc3574a62d01153ab0dd3854eadd933347f9bd10c32488c7b5500b55d4a
4
+ data.tar.gz: 3aa03c9d8c81dd633fc1f57bd3cce5a268bfaf8d0faa54aa0b7af7329aed0123
5
+ SHA512:
6
+ metadata.gz: 64dce2e2e24e0a80b79e1213acc46535e006044ee5902dbd06068d718908c9b96917bb15e9566592192b9e609b60f8bdb7de5e3801f52496b72242df70d773a2
7
+ data.tar.gz: d9ac14681c7731f1d4a6c75bd387b540f68492ed7f1e8c87161e1abdfaf402aef5041835caed0c1d544da73892d512066346f80b35dcf1c8fdcf5bbadfcd12cd
data/.gitignore ADDED
@@ -0,0 +1,11 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ sudo: false
3
+ language: ruby
4
+ cache: bundler
5
+ rvm:
6
+ - 2.5.1
7
+ before_install: gem install bundler -v 1.17.1
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in instagram-crawler.gemspec
6
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,59 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ instagram-crawler (0.1.0)
5
+ colorize (~> 0.8)
6
+ http (~> 4.0)
7
+ nokogiri (~> 1.8)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ addressable (2.5.2)
13
+ public_suffix (>= 2.0.2, < 4.0)
14
+ colorize (0.8.1)
15
+ diff-lcs (1.3)
16
+ domain_name (0.5.20180417)
17
+ unf (>= 0.0.5, < 1.0.0)
18
+ http (4.0.0)
19
+ addressable (~> 2.3)
20
+ http-cookie (~> 1.0)
21
+ http-form_data (~> 2.0)
22
+ http_parser.rb (~> 0.6.0)
23
+ http-cookie (1.0.3)
24
+ domain_name (~> 0.5)
25
+ http-form_data (2.1.1)
26
+ http_parser.rb (0.6.0)
27
+ mini_portile2 (2.3.0)
28
+ nokogiri (1.8.5)
29
+ mini_portile2 (~> 2.3.0)
30
+ public_suffix (3.0.3)
31
+ rake (10.5.0)
32
+ rspec (3.8.0)
33
+ rspec-core (~> 3.8.0)
34
+ rspec-expectations (~> 3.8.0)
35
+ rspec-mocks (~> 3.8.0)
36
+ rspec-core (3.8.0)
37
+ rspec-support (~> 3.8.0)
38
+ rspec-expectations (3.8.2)
39
+ diff-lcs (>= 1.2.0, < 2.0)
40
+ rspec-support (~> 3.8.0)
41
+ rspec-mocks (3.8.0)
42
+ diff-lcs (>= 1.2.0, < 2.0)
43
+ rspec-support (~> 3.8.0)
44
+ rspec-support (3.8.0)
45
+ unf (0.1.4)
46
+ unf_ext
47
+ unf_ext (0.0.7.5)
48
+
49
+ PLATFORMS
50
+ ruby
51
+
52
+ DEPENDENCIES
53
+ bundler (~> 1.17)
54
+ instagram-crawler!
55
+ rake (~> 10.0)
56
+ rspec (~> 3.0)
57
+
58
+ BUNDLED WITH
59
+ 1.17.1
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2018 Leon Ji
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,76 @@
1
+ # Instagram Crawler
2
+
3
+ > The easiest way to download instagram photos, posts and videos.
4
+
5
+ <img src="screenshots/logo.png" width="200" align="center">
6
+
7
+ ### Instagram Crawler is a ruby gem to crawl instagram photos, posts and videos for download.
8
+
9
+ ## Installation
10
+
11
+ ```
12
+ $ gem install instagram-crawler
13
+ ```
14
+
15
+ ## Setting env variable
16
+
17
+ ```
18
+ export sessionid=[your instagram sessionid]
19
+ ```
20
+
21
+ ![](screenshots/sessionid.png)
22
+
23
+ ## Getting Started
24
+
25
+ ![](screenshots/instagram_crawler_demo.gif)
26
+
27
+ ### Show all file links
28
+
29
+ `-u || --username`
30
+
31
+ ```ruby
32
+ instagram-crawler -u <user_name>
33
+ ```
34
+
35
+ ### Download files after this date (YYYYMMDD)
36
+
37
+ `-a || --after `
38
+
39
+ ```ruby
40
+ instagram-crawler -u <user_name> -d -a 20181120
41
+ ```
42
+
43
+ ### Generate log file
44
+
45
+ `-l || --log `
46
+
47
+ ```ruby
48
+ instagram-crawler -u <user_name> -l
49
+ ```
50
+
51
+ ### Help
52
+
53
+ `instagram-crawler -h | --help`
54
+
55
+ ```ruby
56
+ Usage:
57
+ instagram-crawler [options]
58
+ See https://github.com/mgleon08/instagram-crawler for more information.
59
+
60
+ options:
61
+ -u, --username USERNAME Instagram username
62
+ -d, --download Download files
63
+ -a, --after DATE Download files after this date (YYYYMMDD)
64
+ -l, --log Generate a log file in the current directory
65
+ -v, --version Show the instagram-crawler version
66
+ -h, --help Show this message
67
+ ```
68
+
69
+ ## Contributing
70
+
71
+ Bug reports and pull requests are welcome on GitHub at [`https://github.com/mgleon08/instagram-crawler/pulls`](https://github.com/mgleon08/instagram-crawler/pulls)
72
+
73
+ ## License
74
+
75
+ * Copyright (c) 2018 Leon Ji. See [LICENSE.txt](https://github.com/mgleon08/instagram-crawler/blob/master/LICENSE.txt) for further details.
76
+ * The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "instagram_crawler"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
@@ -0,0 +1,12 @@
1
#!/usr/bin/env ruby
# Command-line entry point for the instagram-crawler gem.
#
# Flow: verify the `sessionid` environment variable is present, parse ARGV,
# configure logging, then run the crawl. Any StandardError is reported on
# stderr and the process terminates with a non-zero status so shells and
# scripts can detect the failure.
require_relative '../lib/instagram_crawler'

begin
  # The session id is read again later when paginated requests are made;
  # fail early with a helpful message when it is missing.
  raise InstagramCrawler::Errors::EnvError if ENV["sessionid"].nil?

  args = InstagramCrawler::Parser::Args.new(ARGV)
  InstagramCrawler::Logger.setting(args.log)
  InstagramCrawler::Main.run
rescue => e
  $stderr.puts e.message
  # A bare `exit` would report success (status 0) even though we failed.
  exit 1
end
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,48 @@
1
+
2
+ lib = File.expand_path("../lib", __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require "instagram_crawler/version"
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "instagram-crawler"
8
+ spec.version = InstagramCrawler::VERSION
9
+ spec.authors = ["Leon Ji"]
10
+ spec.email = ["mgleon08@gmail.com"]
11
+
12
+ spec.summary = %q{The easiest way to download instagram photos, posts and videos.}
13
+ spec.description = %q{Crawl instagram photos, posts and videos for download.}
14
+ spec.homepage = "https://github.com/mgleon08/instagram-crawler"
15
+ spec.license = "MIT"
16
+
17
+ # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
18
+ # to allow pushing to a single host or delete this section to allow pushing to any host.
19
+ if spec.respond_to?(:metadata)
20
+ spec.metadata["allowed_push_host"] = "https://rubygems.org/"
21
+
22
+ spec.metadata["homepage_uri"] = spec.homepage
23
+ spec.metadata["source_code_uri"] = "https://github.com/mgleon08/instagram-crawler"
24
+ spec.metadata["changelog_uri"] = "https://github.com/mgleon08/instagram-crawler"
25
+ else
26
+ raise "RubyGems 2.0 or newer is required to protect against " \
27
+ "public gem pushes."
28
+ end
29
+
30
+ # Specify which files should be added to the gem when it is released.
31
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
32
+ spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
33
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
34
+ end
35
+ spec.bindir = "bin"
36
+ spec.executables = ["instagram-crawler"]
37
+ spec.require_paths = ["lib"]
38
+
39
+ spec.required_ruby_version = ">= 2.5.1"
40
+
41
+ spec.add_development_dependency "bundler", "~> 1.17"
42
+ spec.add_development_dependency "rake", "~> 10.0"
43
+ spec.add_development_dependency "rspec", "~> 3.0"
44
+
45
+ spec.add_runtime_dependency "nokogiri", "~> 1.8"
46
+ spec.add_runtime_dependency "http", "~> 4.0"
47
+ spec.add_runtime_dependency "colorize", "~> 0.8"
48
+ end
@@ -0,0 +1,26 @@
1
+ $LOAD_PATH << File.dirname(__FILE__)
2
+
3
+ require "optparse"
4
+ require "http"
5
+ require "nokogiri"
6
+ require "logger"
7
+ require "colorize"
8
+
9
+ require "instagram_crawler/version"
10
+
11
+ require "instagram_crawler/main"
12
+ require "instagram_crawler/file"
13
+ require "instagram_crawler/logger"
14
+ require "instagram_crawler/multi_io"
15
+ require "instagram_crawler/config"
16
+
17
+ # parser
18
+ require "instagram_crawler/parser/args"
19
+ require "instagram_crawler/parser/base"
20
+ require "instagram_crawler/parser/html"
21
+ require "instagram_crawler/parser/json"
22
+
23
+ # error
24
+ require "instagram_crawler/errors/http_error"
25
+ require "instagram_crawler/errors/env_error"
26
+ require "instagram_crawler/errors/arg_error"
@@ -0,0 +1,22 @@
1
# Time.parse is provided by the stdlib "time" extension, not by core Time.
require "time"

module InstagramCrawler
  # Process-wide crawl settings, stored as class-level instance variables.
  #
  # Readers: default_url, user_name, base_url, base_path, log_path,
  # after_date, parse_date. `download` is a plain read/write flag.
  class Config
    @default_url = "https://www.instagram.com".freeze
    class << self
      attr_reader :default_url, :user_name, :base_url, :base_path,
                  :log_path, :after_date, :parse_date
      attr_accessor :download

      # Stores the account name and derives the profile URL plus the output
      # and log directories (all relative to the current working directory).
      def user_name=(user_name)
        @user_name = user_name
        @base_url  = "#{default_url}/#{user_name}/"
        @base_path = "./instagram-crawler/#{user_name}"
        @log_path  = "./instagram-crawler/#{user_name}/log_file"
      end

      # Stores the cut-off date string and its epoch-seconds equivalent.
      #
      # The string is parsed BEFORE either value is stored, so an unparsable
      # date (Time.parse raises ArgumentError) leaves the previous
      # after_date/parse_date pair intact instead of half-updated.
      def after_date=(after_date)
        parsed = Time.parse(after_date).to_i
        @after_date = after_date
        @parse_date = parsed
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module InstagramCrawler
  module Errors
    # Signals that a required command-line option was not supplied.
    class ArgError < StandardError
      # message: text naming the missing option(s); it is appended to the
      # fixed "Missing argument: " prefix.
      def initialize(message)
        detail = "Missing argument: #{message}"
        super(detail)
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module InstagramCrawler
  module Errors
    # Signals that the `sessionid` environment variable is not defined.
    # Takes no arguments; the explanatory message is fixed.
    class EnvError < StandardError
      def initialize
        lines = [
          "Undefined env variable sessionid",
          "You should setting env variable 'export sessionid=[your instagram sessionid]' and execute again."
        ]
        super(lines.join("\n"))
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module InstagramCrawler
  module Errors
    # Signals a failed HTTP request.
    class HttpError < StandardError
      # message: failure description; it is appended to the fixed
      # "HttpError: " prefix.
      def initialize(message)
        detail = "HttpError: #{message}"
        super(detail)
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
require "fileutils"
require "uri"

module InstagramCrawler
  # ::File subclass with class-level helpers for creating the output
  # directory and saving remote media to disk. Both public helpers are
  # no-ops unless Config.download is truthy.
  class File < ::File
    class << self
      # Creates Config.base_path (including parents) and logs the location.
      def mkdir
        return unless Config.download
        Logger.info "Create directory in #{Config.base_path}\n"
        # mkdir_p is idempotent, so no existence check is needed.
        FileUtils.mkdir_p(Config.base_path)
      end

      # Fetches `url` and writes the body to
      # "<Config.base_path>/<dir_name>/<file_name><ext>".
      #
      # url       - remote media URL.
      # dir_name  - sub-directory under Config.base_path.
      # file_name - file name without extension.
      #
      # Raises Errors::HttpError when the response code is not 200.
      def download(url, dir_name, file_name)
        return unless Config.download

        dir_path = "#{Config.base_path}/#{dir_name}"
        FileUtils.mkdir_p(dir_path)

        file_path   = "#{dir_path}/#{file_name}#{url_extname(url)}"
        binary_data = get_binary_data(url)

        File.open(file_path, 'wb') { |f| f.write(binary_data) }
      end

      private

      # Extension of the URL's *path* component. Taking extname of the whole
      # URL is wrong once a query string is present: for
      # ".../pic.jpg?host=cdn.example.com" it would yield ".com".
      def url_extname(url)
        File.extname(URI.parse(url).path.to_s)
      rescue URI::InvalidURIError
        # Not parseable as a URI: fall back to the raw string.
        File.extname(url)
      end

      # Returns the response body as a String, or raises on a non-200 code.
      def get_binary_data(url)
        res = HTTP.get(url)
        raise Errors::HttpError, "#{res.code} #{res.reason}" if res.code != 200
        res.to_s
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
require "fileutils"
require "logger"

module InstagramCrawler
  # ::Logger subclass exposing a single shared logger through class methods.
  # Messages are emitted verbatim (no severity/timestamp prefix) followed by
  # a newline.
  class Logger < ::Logger
    class << self
      # Builds the shared logger and returns it.
      #
      # log - when truthy, output also goes to a timestamped file under
      #       Config.log_path in addition to STDOUT.
      def setting(log)
        targets = [STDOUT]
        targets << log_file if log

        logger = Logger.new(MultiIO.new(*targets))
        logger.level = Logger::INFO
        logger.formatter = proc { |_severity, _datetime, _progname, msg| "#{msg}\n" }
        @logger = logger
      end

      # Logs `str` at INFO level. If `setting` has not been called yet, a
      # STDOUT-only logger is created on demand instead of failing with a
      # NoMethodError on nil.
      def info(str)
        (@logger || setting(false)).info(str)
      end

      private

      # Opens (append mode) "<Config.log_path>/<timestamp>.txt", creating the
      # directory first, and returns the open file handle.
      def log_file
        time = Time.now.strftime('%Y-%m-%dT%H:%M:%S')
        FileUtils.mkdir_p(Config.log_path)
        File.open("#{Config.log_path}/#{time}.txt", 'a')
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
module InstagramCrawler
  # Orchestrates one crawl run using the values already stored in Config.
  class Main
    # Prints the banner and version, prepares the output directory, parses
    # the profile page and, when the first page reports more results, hands
    # over to the JSON parser for the remaining pages.
    def self.run
      Logger.info text_to_ASCII.light_red
      Logger.info "Running instagram-crawler v#{InstagramCrawler::VERSION}\n"
      File.mkdir
      page_info, user_id = Parser::Html.new(Config.base_url).parsing
      Parser::Json.new(page_info, user_id).parsing if page_info["has_next_page"]
      Logger.info "\nSuccess, all files have been downloaded!".light_green
    end

    # Banner text shown at start-up.
    # NOTE: a bare `private` keyword does not apply to `def self.` methods,
    # so this method has always been publicly callable; it stays that way
    # here to avoid changing the interface.
    def self.text_to_ASCII
      <<~USAGE.freeze
        ___ _ ___ _
        |_ _|_ _ __| |_ __ _ __ _ _ _ __ _ _ __ / __|_ _ __ ___ __ _| |___ _ _
        | || ' \\(_-< _/ _` / _` | '_/ _` | ' \\ | (__| '_/ _` \\ V V / / -_) '_|
        |___|_||_/__/\\__\\__,_\\__, |_| \\__,_|_|_|_| \\___|_| \\__,_|\\_/\\_/|_\\___|_|
        |___/
      USAGE
    end
  end
end
@@ -0,0 +1,36 @@
1
module InstagramCrawler
  # Fans a single write out to several IO-like targets. File targets get the
  # text with the known ANSI colour sequences removed; every other target
  # receives the text unchanged.
  class MultiIO
    # targets - objects responding to #write and #close.
    def initialize(*targets)
      @targets = targets
      # Regex source fragments for the escape sequences to strip.
      @shell_colors = {
        light_red:     '\\e\\[0;91;49m',
        light_green:   '\\e\\[0;92;49m',
        light_yellow:  '\\e\\[0;93;49m',
        light_blue:    '\\e\\[0;94;49m',
        light_magenta: '\\e\\[0;95;49m',
        light_cyan:    '\\e\\[0;96;49m',
        out_put:       '\\e\\[0m'
      }
      # Compiled once instead of once per colour per write.
      @color_pattern = Regexp.union(@shell_colors.values.map { |src| Regexp.new(src) })
    end

    # Writes `args` to every target. The caller's string is never modified,
    # so frozen strings are accepted and the order of targets cannot affect
    # what a non-file target receives.
    def write(args)
      @targets.each do |target|
        # ::File also matches subclasses such as InstagramCrawler::File.
        text = target.is_a?(::File) ? pretty_file_text(args) : args
        target.write(text)
      end
    end

    # Closes every target.
    def close
      @targets.each(&:close)
    end

    private

    # Returns a copy of `args` with the colour escape sequences removed.
    def pretty_file_text(args)
      args.gsub(@color_pattern, '')
    end
  end
end
@@ -0,0 +1,43 @@
1
module InstagramCrawler
  module Parser
    # Parses the command line: crawl options are written into Config, the
    # log flag is kept on this object, and a missing username is rejected.
    class Args
      attr_accessor :log

      # args - argument array (e.g. ARGV); recognised options are removed
      #        from it in place by OptionParser#parse!.
      def initialize(args)
        @args = args
        parse_args
        validates_required_args
      end

      private

      # Declares every supported switch and runs the parser over @args.
      # `-v` and `-h` print their output and terminate the process.
      def parse_args
        parser = OptionParser.new do |o|
          o.banner = usage_msg
          o.separator ''
          o.separator 'options:'

          o.on('-u', '--username USERNAME', 'Instagram username') do |name|
            Config.user_name = name
          end
          o.on('-d', '--download', 'Download files') do
            Config.download = true
          end
          o.on('-a', '--after DATE', 'Download files after this date (YYYYMMDD)') do |date|
            Config.after_date = date
          end
          o.on('-l', '--log', 'Generate a log file in the current directory') do
            self.log = true
          end
          o.on('-v', '--version', 'Show the instagram-crawler version') do
            puts("instagram-crawler #{InstagramCrawler::VERSION}")
            exit
          end
          o.on('-h', '--help', 'Show this message') do
            puts(o)
            exit
          end
        end

        parser.parse!(@args)
      end

      # Banner printed at the top of the help text.
      def usage_msg
        <<~USAGE.freeze
          Usage:
          instagram-crawler [options]
          See https://github.com/mgleon08/instagram-crawler for more information.
        USAGE
      end

      # Raises Errors::ArgError unless a username has been configured.
      def validates_required_args
        return unless Config.user_name.nil?

        raise InstagramCrawler::Errors::ArgError.new('-u or --username')
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
module InstagramCrawler
  module Parser
    # Helpers shared by the parsers. Every method here is private.
    class Base
      private

      # Logs and downloads each entry of a multi-item post. Files are
      # numbered from 1 and stored under "post/<time>".
      def parse_post(posts, time)
        posts.each_with_index do |post, idx|
          position = idx + 1
          url = post["node"]["display_url"]
          output(time, url)
          File.download(url, "post/#{time}", "#{position}.#{time}")
        end
      end

      # Logs one line: the bracketed timestamp (light cyan) and the URL.
      def output(time, url)
        stamp = "[#{time}]".light_cyan
        Logger.info(stamp + " #{url}")
      end

      # Formats epoch seconds as "YYYY-MM-DDTHH:MM" via Time.at.
      def parse_to_date(ts)
        moment = Time.at(ts)
        moment.strftime('%Y-%m-%dT%H:%M')
      end

      # When a cut-off date is configured and `time` is older than it, logs
      # the completion message and terminates the process.
      def check_time(time)
        return unless Config.after_date
        return unless Config.parse_date > time

        Logger.info "\nSuccess, the files after #{Config.after_date} have been downloaded!".light_green
        exit
      end
    end
  end
end
@@ -0,0 +1,79 @@
1
require "json"

module InstagramCrawler
  module Parser
    # Fetches an Instagram HTML page and extracts media information from it.
    class Html < Base
      attr_reader :html

      # url - page to fetch. Raises Errors::HttpError on a non-200 response.
      def initialize(url)
        @html = get_html(url)
      end

      # Processes the first page of a profile's timeline.
      #
      # Returns [page_info, user_id]: the pagination hash for the timeline
      # and the value of "logging_page_id" without its "profilePage_" prefix.
      def parsing
        profile   = shared_data["entry_data"]["ProfilePage"][0]
        timeline  = profile["graphql"]["user"]["edge_owner_to_timeline_media"]
        page_info = timeline["page_info"]
        # delete_prefix removes the literal prefix. String#delete would treat
        # the argument as a *set of characters* and strip each one wherever
        # it occurs in the string.
        user_id = profile["logging_page_id"].delete_prefix("profilePage_")

        loop_edges(timeline["edges"])

        return page_info, user_id
      end

      # Returns the video URL advertised by the page's og:video meta tag.
      # Raises Errors::HttpError when the tag is absent.
      def parsing_video_page
        doc = Nokogiri::HTML(html)
        meta_v = doc.at_xpath("//meta[@property='og:video']")
        raise Errors::HttpError, "og:video meta tag not found" if meta_v.nil?
        # Read the attribute by name rather than relying on it being the
        # last attribute in the tag.
        meta_v["content"]
      end

      # Returns either the Array of child edges (multi-item post) or the
      # display URL String (single photo) of a post page.
      def parsing_photo_page
        shortcode_media = shared_data["entry_data"]["PostPage"][0]["graphql"]["shortcode_media"]

        if shortcode_media["edge_sidecar_to_children"]
          shortcode_media["edge_sidecar_to_children"]["edges"]
        else
          shortcode_media["display_url"]
        end
      end

      private

      # Parses the JSON object assigned to `window._sharedData` in the page.
      # The assignment prefix and the trailing semicolon are stripped by
      # pattern instead of by fixed character offsets.
      # Raises Errors::HttpError when no such script element exists.
      def shared_data
        doc = Nokogiri::HTML(html)
        js_data = doc.at_xpath("//script[contains(text(),'window._sharedData')]")
        raise Errors::HttpError, "window._sharedData not found in page" if js_data.nil?

        payload = js_data.text.strip
                         .sub(/\Awindow\._sharedData\s*=\s*/, '')
                         .sub(/;\z/, '')
        JSON.parse(payload)
      end

      # Logs and downloads every timeline edge. Videos and photos require
      # one extra page fetch each to resolve the media URL.
      def loop_edges(edges)
        edges.each do |edge|
          node = edge["node"]
          check_time(node["taken_at_timestamp"])
          time = parse_to_date(node["taken_at_timestamp"])
          page_url = "https://www.instagram.com/p/#{node["shortcode"]}/"

          if node["is_video"]
            Logger.info "========VIDEO========".light_yellow
            url = Html.new(page_url).parsing_video_page
            output(time, url)
            File.download(url, 'video', time)
          else
            shortcode_media = Html.new(page_url).parsing_photo_page
            if shortcode_media.is_a? Array
              Logger.info "========POST========".light_magenta
              parse_post(shortcode_media, time)
            else
              Logger.info "========PHOTO========".light_green
              url = shortcode_media
              output(time, url)
              File.download(url, 'photo', time)
            end
          end
        end
      end

      # Returns the response body as a String, or raises on a non-200 code.
      def get_html(url)
        res = HTTP.get(url)
        raise Errors::HttpError, "#{res.code} #{res.reason}" if res.code != 200
        res.to_s
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
require "json"
require "uri"

module InstagramCrawler
  module Parser
    # Walks the remaining timeline pages through the GraphQL query endpoint,
    # starting from the pagination info returned by the HTML parser.
    class Json < Base
      attr_reader :page_info, :user_id

      # page_info - pagination hash with "end_cursor" and "has_next_page".
      # user_id   - account id used in the query variables.
      def initialize(page_info, user_id)
        @page_info = page_info
        @user_id = user_id
      end

      # Fetches and processes pages until "has_next_page" is false. At least
      # one request is always made.
      def parsing
        loop do
          url = next_url(page_info["end_cursor"], user_id)
          json = JSON.parse(get_json(url))
          timeline = json["data"]["user"]["edge_owner_to_timeline_media"]
          @page_info = timeline["page_info"]

          loop_edges(timeline["edges"])
          break unless page_info["has_next_page"]
        end
      end

      private

      # Logs and downloads every edge of one result page.
      def loop_edges(edges)
        edges.each do |edge|
          node = edge["node"]
          check_time(node["taken_at_timestamp"])
          time = parse_to_date(node["taken_at_timestamp"])

          if node["is_video"]
            Logger.info "========VIDEO========".light_yellow
            url = node["video_url"]
            output(time, url)
            File.download(url, 'video', time)
          elsif !node["edge_sidecar_to_children"].nil?
            Logger.info "========POST========".light_magenta
            parse_post(node["edge_sidecar_to_children"]["edges"], time)
          else
            Logger.info "========PHOTO========".light_green
            url = node["display_url"]
            output(time, url)
            File.download(url, 'photo', time)
          end
        end
      end

      # Performs the request with the `sessionid` cookie taken from the
      # environment. Returns the body String; raises on a non-200 code.
      def get_json(url)
        res = HTTP.cookies(sessionid: ENV["sessionid"]).get(url)
        raise Errors::HttpError, "#{res.code} #{res.reason}" if res.code != 200
        res.to_s
      end

      # Builds the query URL for the page after `end_cursor`.
      #
      # The complete cursor is passed in and the whole variables object is
      # percent-encoded. Slicing the last two characters off and re-adding
      # "%3D%3D" only works for cursors that end in "==", and leaves any
      # other reserved character in the cursor unescaped.
      def next_url(end_cursor, user_id)
        variables = JSON.generate("id" => user_id, "first" => 12, "after" => end_cursor)
        "https://www.instagram.com/graphql/query/?query_hash=f412a8bfd8332a76950fefc1da5785ef&variables=#{URI.encode_www_form_component(variables)}"
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module InstagramCrawler
  # Release number of the gem; read by the gemspec and shown by the CLI.
  VERSION = '0.1.1'.freeze
end
Binary file
Binary file
metadata ADDED
@@ -0,0 +1,162 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: instagram-crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Leon Ji
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-11-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.17'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.17'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: nokogiri
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.8'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.8'
69
+ - !ruby/object:Gem::Dependency
70
+ name: http
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '4.0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '4.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: colorize
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '0.8'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.8'
97
+ description: Crawl instagram photos, posts and videos for download.
98
+ email:
99
+ - mgleon08@gmail.com
100
+ executables:
101
+ - instagram-crawler
102
+ extensions: []
103
+ extra_rdoc_files: []
104
+ files:
105
+ - ".gitignore"
106
+ - ".rspec"
107
+ - ".travis.yml"
108
+ - Gemfile
109
+ - Gemfile.lock
110
+ - LICENSE.txt
111
+ - README.md
112
+ - Rakefile
113
+ - bin/console
114
+ - bin/instagram-crawler
115
+ - bin/setup
116
+ - instagram-crawler.gemspec
117
+ - lib/instagram_crawler.rb
118
+ - lib/instagram_crawler/config.rb
119
+ - lib/instagram_crawler/errors/arg_error.rb
120
+ - lib/instagram_crawler/errors/env_error.rb
121
+ - lib/instagram_crawler/errors/http_error.rb
122
+ - lib/instagram_crawler/file.rb
123
+ - lib/instagram_crawler/logger.rb
124
+ - lib/instagram_crawler/main.rb
125
+ - lib/instagram_crawler/multi_io.rb
126
+ - lib/instagram_crawler/parser/args.rb
127
+ - lib/instagram_crawler/parser/base.rb
128
+ - lib/instagram_crawler/parser/html.rb
129
+ - lib/instagram_crawler/parser/json.rb
130
+ - lib/instagram_crawler/version.rb
131
+ - screenshots/instagram_crawler_demo.gif
132
+ - screenshots/logo.png
133
+ - screenshots/sessionid.png
134
+ homepage: https://github.com/mgleon08/instagram-crawler
135
+ licenses:
136
+ - MIT
137
+ metadata:
138
+ allowed_push_host: https://rubygems.org/
139
+ homepage_uri: https://github.com/mgleon08/instagram-crawler
140
+ source_code_uri: https://github.com/mgleon08/instagram-crawler
141
+ changelog_uri: https://github.com/mgleon08/instagram-crawler
142
+ post_install_message:
143
+ rdoc_options: []
144
+ require_paths:
145
+ - lib
146
+ required_ruby_version: !ruby/object:Gem::Requirement
147
+ requirements:
148
+ - - ">="
149
+ - !ruby/object:Gem::Version
150
+ version: 2.5.1
151
+ required_rubygems_version: !ruby/object:Gem::Requirement
152
+ requirements:
153
+ - - ">="
154
+ - !ruby/object:Gem::Version
155
+ version: '0'
156
+ requirements: []
157
+ rubyforge_project:
158
+ rubygems_version: 2.7.6
159
+ signing_key:
160
+ specification_version: 4
161
+ summary: The easiest way to download instagram photos, posts and videos.
162
+ test_files: []