instagram-crawler 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 27b1cbc3574a62d01153ab0dd3854eadd933347f9bd10c32488c7b5500b55d4a
4
+ data.tar.gz: 3aa03c9d8c81dd633fc1f57bd3cce5a268bfaf8d0faa54aa0b7af7329aed0123
5
+ SHA512:
6
+ metadata.gz: 64dce2e2e24e0a80b79e1213acc46535e006044ee5902dbd06068d718908c9b96917bb15e9566592192b9e609b60f8bdb7de5e3801f52496b72242df70d773a2
7
+ data.tar.gz: d9ac14681c7731f1d4a6c75bd387b540f68492ed7f1e8c87161e1abdfaf402aef5041835caed0c1d544da73892d512066346f80b35dcf1c8fdcf5bbadfcd12cd
data/.gitignore ADDED
@@ -0,0 +1,11 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ sudo: false
3
+ language: ruby
4
+ cache: bundler
5
+ rvm:
6
+ - 2.5.1
7
+ before_install: gem install bundler -v 1.17.1
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in instagram-crawler.gemspec
6
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,59 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ instagram-crawler (0.1.0)
5
+ colorize (~> 0.8)
6
+ http (~> 4.0)
7
+ nokogiri (~> 1.8)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ addressable (2.5.2)
13
+ public_suffix (>= 2.0.2, < 4.0)
14
+ colorize (0.8.1)
15
+ diff-lcs (1.3)
16
+ domain_name (0.5.20180417)
17
+ unf (>= 0.0.5, < 1.0.0)
18
+ http (4.0.0)
19
+ addressable (~> 2.3)
20
+ http-cookie (~> 1.0)
21
+ http-form_data (~> 2.0)
22
+ http_parser.rb (~> 0.6.0)
23
+ http-cookie (1.0.3)
24
+ domain_name (~> 0.5)
25
+ http-form_data (2.1.1)
26
+ http_parser.rb (0.6.0)
27
+ mini_portile2 (2.3.0)
28
+ nokogiri (1.8.5)
29
+ mini_portile2 (~> 2.3.0)
30
+ public_suffix (3.0.3)
31
+ rake (10.5.0)
32
+ rspec (3.8.0)
33
+ rspec-core (~> 3.8.0)
34
+ rspec-expectations (~> 3.8.0)
35
+ rspec-mocks (~> 3.8.0)
36
+ rspec-core (3.8.0)
37
+ rspec-support (~> 3.8.0)
38
+ rspec-expectations (3.8.2)
39
+ diff-lcs (>= 1.2.0, < 2.0)
40
+ rspec-support (~> 3.8.0)
41
+ rspec-mocks (3.8.0)
42
+ diff-lcs (>= 1.2.0, < 2.0)
43
+ rspec-support (~> 3.8.0)
44
+ rspec-support (3.8.0)
45
+ unf (0.1.4)
46
+ unf_ext
47
+ unf_ext (0.0.7.5)
48
+
49
+ PLATFORMS
50
+ ruby
51
+
52
+ DEPENDENCIES
53
+ bundler (~> 1.17)
54
+ instagram-crawler!
55
+ rake (~> 10.0)
56
+ rspec (~> 3.0)
57
+
58
+ BUNDLED WITH
59
+ 1.17.1
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2018 Leon Ji
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,76 @@
1
+ # Instagram Crawler
2
+
3
+ > The easiest way to download instagram photos, posts and videos.
4
+
5
+ <img src="screenshots/logo.png" width="200" align="center">
6
+
7
+ ### Instagram Crawler is a ruby gem to crawl instagram photos, posts and videos for download.
8
+
9
+ ## Installation
10
+
11
+ ```
12
+ $ gem install instagram-crawler
13
+ ```
14
+
15
+ ## Setting env variable
16
+
17
+ ```
18
+ export sessionid=[your instagram sessionid]
19
+ ```
20
+
21
+ ![](screenshots/sessionid.png)
22
+
23
+ ## Getting Started
24
+
25
+ ![](screenshots/instagram_crawler_demo.gif)
26
+
27
+ ### Show all file links
28
+
29
+ `-u || --username`
30
+
31
+ ```ruby
32
+ instagram-crawler -u <user_name>
33
+ ```
34
+
35
+ ### Download files after this date (YYYYMMDD)
36
+
37
+ `-a || --after DATE` (combine with `-d || --download` to save the files)
38
+
39
+ ```ruby
40
+ instagram-crawler -u <user_name> -d -a 20181120
41
+ ```
42
+
43
+ ### Generate log file
44
+
45
+ `-l || --log `
46
+
47
+ ```ruby
48
+ instagram-crawler -u <user_name> -l
49
+ ```
50
+
51
+ ### Help
52
+
53
+ `instagram-crawler -h | --help`
54
+
55
+ ```ruby
56
+ Usage:
57
+ instagram-crawler [options]
58
+ See https://github.com/mgleon08/instagram-crawler for more information.
59
+
60
+ options:
61
+ -u, --username USERNAME Instagram username
62
+ -d, --download Download files
63
+ -a, --after DATE Download files after this date (YYYYMMDD)
64
+ -l, --log Generate a log file in the current directory
65
+ -v, --version Show the instagram-crawler version
66
+ -h, --help Show this message
67
+ ```
68
+
69
+ ## Contributing
70
+
71
+ Bug reports and pull requests are welcome on GitHub at [`https://github.com/mgleon08/instagram-crawler/pulls`](https://github.com/mgleon08/instagram-crawler/pulls)
72
+
73
+ ## License
74
+
75
+ * Copyright (c) 2018 Leon Ji. See [LICENSE.txt](https://github.com/mgleon08/instagram-crawler/blob/master/LICENSE.txt) for further details.
76
+ * The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "instagram_crawler"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
@@ -0,0 +1,12 @@
#!/usr/bin/env ruby
# Command-line entry point for the instagram-crawler gem.
require_relative '../lib/instagram_crawler'

begin
  # The JSON parser sends this value as the `sessionid` cookie, so refuse
  # to start when it is missing.
  raise InstagramCrawler::Errors::EnvError if ENV["sessionid"].nil?
  args = InstagramCrawler::Parser::Args.new(ARGV)
  InstagramCrawler::Logger.setting(args.log)
  InstagramCrawler::Main.run
rescue => e
  # Report the failure and exit non-zero so that shells and scripts can
  # tell a failed run from a successful one (a bare `exit` returns 0).
  $stderr.puts e.message
  exit 1
end
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,48 @@
1
+
2
+ lib = File.expand_path("../lib", __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require "instagram_crawler/version"
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "instagram-crawler"
8
+ spec.version = InstagramCrawler::VERSION
9
+ spec.authors = ["Leon Ji"]
10
+ spec.email = ["mgleon08@gmail.com"]
11
+
12
+ spec.summary = %q{The easiest way to download instagram photos, posts and videos.}
13
+ spec.description = %q{Crawl instagram photos, posts and videos for download.}
14
+ spec.homepage = "https://github.com/mgleon08/instagram-crawler"
15
+ spec.license = "MIT"
16
+
17
+ # Restrict `gem push` to RubyGems.org through 'allowed_push_host'. Change the host to publish
18
+ # to a different server, or delete this section to allow pushing to any host.
19
+ if spec.respond_to?(:metadata)
20
+ spec.metadata["allowed_push_host"] = "https://rubygems.org/"
21
+
22
+ spec.metadata["homepage_uri"] = spec.homepage
23
+ spec.metadata["source_code_uri"] = "https://github.com/mgleon08/instagram-crawler"
24
+ spec.metadata["changelog_uri"] = "https://github.com/mgleon08/instagram-crawler"
25
+ else
26
+ raise "RubyGems 2.0 or newer is required to protect against " \
27
+ "public gem pushes."
28
+ end
29
+
30
+ # Specify which files should be added to the gem when it is released.
31
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
32
+ spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
33
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
34
+ end
35
+ spec.bindir = "bin"
36
+ spec.executables = ["instagram-crawler"]
37
+ spec.require_paths = ["lib"]
38
+
39
+ spec.required_ruby_version = ">= 2.5.1"
40
+
41
+ spec.add_development_dependency "bundler", "~> 1.17"
42
+ spec.add_development_dependency "rake", "~> 10.0"
43
+ spec.add_development_dependency "rspec", "~> 3.0"
44
+
45
+ spec.add_runtime_dependency "nokogiri", "~> 1.8"
46
+ spec.add_runtime_dependency "http", "~> 4.0"
47
+ spec.add_runtime_dependency "colorize", "~> 0.8"
48
+ end
@@ -0,0 +1,26 @@
1
+ $LOAD_PATH << File.dirname(__FILE__)
2
+
3
+ require "optparse"
4
+ require "http"
5
+ require "nokogiri"
6
+ require "logger"
7
+ require "colorize"
8
+
9
+ require "instagram_crawler/version"
10
+
11
+ require "instagram_crawler/main"
12
+ require "instagram_crawler/file"
13
+ require "instagram_crawler/logger"
14
+ require "instagram_crawler/multi_io"
15
+ require "instagram_crawler/config"
16
+
17
+ # parser
18
+ require "instagram_crawler/parser/args"
19
+ require "instagram_crawler/parser/base"
20
+ require "instagram_crawler/parser/html"
21
+ require "instagram_crawler/parser/json"
22
+
23
+ # error
24
+ require "instagram_crawler/errors/http_error"
25
+ require "instagram_crawler/errors/env_error"
26
+ require "instagram_crawler/errors/arg_error"
@@ -0,0 +1,22 @@
require "time" # Time.parse is provided by the stdlib "time" extension, not by core Time

module InstagramCrawler
  # Process-wide settings kept in class-level instance variables.
  # The command-line option parser fills them in before the crawl starts.
  class Config
    @default_url = "https://www.instagram.com".freeze

    class << self
      attr_reader :default_url, :user_name, :base_url, :base_path,
                  :log_path, :after_date, :parse_date
      attr_accessor :download

      # Stores the target account name and derives from it the profile URL,
      # the download directory and the log directory.
      def user_name=(user_name)
        @user_name = user_name
        @base_url  = "#{default_url}/#{user_name}/"
        @base_path = "./instagram-crawler/#{user_name}"
        @log_path  = "#{@base_path}/log_file"
      end

      # Stores the cut-off date string together with its epoch-seconds form
      # (read back through `parse_date`).
      # Raises ArgumentError when Time.parse cannot interpret the string.
      def after_date=(after_date)
        @after_date = after_date
        @parse_date = Time.parse(after_date).to_i
      end
    end
  end
end
@@ -0,0 +1,9 @@
module InstagramCrawler
  module Errors
    # Raised when a required command-line option is absent.
    # +message+ names the missing option and is appended to a fixed prefix.
    class ArgError < StandardError
      def initialize(message)
        text = "Missing argument: " + message.to_s
        super(text)
      end
    end
  end
end
@@ -0,0 +1,9 @@
module InstagramCrawler
  module Errors
    # Raised when the `sessionid` environment variable is not set.
    # Takes no arguments; the message is fixed.
    class EnvError < StandardError
      def initialize
        lines = [
          "Undefined env variable sessionid",
          "You should setting env variable 'export sessionid=[your instagram sessionid]' and execute again."
        ]
        super(lines.join("\n"))
      end
    end
  end
end
@@ -0,0 +1,9 @@
module InstagramCrawler
  module Errors
    # Raised when an HTTP request does not come back with status 200.
    # +message+ is appended to a fixed "HttpError: " prefix.
    class HttpError < StandardError
      def initialize(message)
        text = "HttpError: " + message.to_s
        super(text)
      end
    end
  end
end
@@ -0,0 +1,34 @@
require "fileutils" # FileUtils.mkdir_p is used below and is not loaded by default

module InstagramCrawler
  # File-system helpers for the crawler. Both public helpers do nothing
  # unless Config.download is truthy.
  class File < ::File
    class << self
      # Creates the per-user download directory (Config.base_path) and logs
      # where it is.
      def mkdir
        return unless Config.download
        Logger.info "Create directory in #{Config.base_path}\n"
        # mkdir_p is a no-op for an existing directory, so no prior check is needed.
        FileUtils.mkdir_p(Config.base_path)
      end

      # Fetches +url+ and writes the response body to
      # "<Config.base_path>/<dir_name>/<file_name><extension>".
      #
      # url       - String address of the media file.
      # dir_name  - String sub-directory below Config.base_path.
      # file_name - String file name without extension.
      #
      # Raises Errors::HttpError when the response status is not 200.
      def download(url, dir_name, file_name)
        return unless Config.download

        # Drop any query string or fragment before asking for the extension;
        # otherwise "pic.jpg?host=cdn.example.com" would yield ".com".
        extname = File.extname(url.sub(/[?#].*\z/m, ""))

        dir_path = "#{Config.base_path}/#{dir_name}"
        FileUtils.mkdir_p(dir_path)

        file_path = "#{dir_path}/#{file_name}#{extname}"
        binary_data = get_binary_data(url)

        File.open(file_path, 'wb') { |f| f.write binary_data }
      end

      private

      # Returns the response body of a GET request to +url+ as a String.
      def get_binary_data(url)
        res = HTTP.get(url)
        raise Errors::HttpError, "#{res.code} #{res.reason}" if res.code != 200
        res.to_s
      end
    end
  end
end
@@ -0,0 +1,29 @@
require "fileutils" # FileUtils.mkdir_p is used by log_file and is not loaded by default
require "logger"

module InstagramCrawler
  # Logger front end used through class methods. Messages are written
  # verbatim (message plus newline) with no severity or timestamp prefix.
  class Logger < ::Logger
    class << self
      # Builds the shared logger.
      #
      # log - when truthy, messages are also appended to a timestamped file
      #       under Config.log_path; otherwise they go to STDOUT only.
      #
      # Returns the configured logger instance.
      def setting(log)
        io = [STDOUT]
        io << log_file if log
        logger = Logger.new(MultiIO.new(*io))
        logger.level = Logger::INFO
        logger.formatter = proc { |_severity, _datetime, _progname, msg| "#{msg}\n" }
        @logger = logger
      end

      # Logs +str+ at INFO level. If `setting` has not been called yet, a
      # STDOUT-only logger is created first instead of failing on nil.
      def info(str)
        (@logger || setting(false)).info(str)
      end

      private

      # Opens, in append mode, "<Config.log_path>/<timestamp>.txt", creating
      # the directory when needed. Returns the open file handle.
      def log_file
        time = Time.now.strftime('%Y-%m-%dT%H:%M:%S')
        FileUtils.mkdir_p(Config.log_path)
        File.open("#{Config.log_path}/#{time}.txt", 'a')
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
+ module InstagramCrawler
2
+ class Main
# Runs one complete crawl: prints the banner and version, prepares the
# download directory, processes the profile page, then follows pagination
# when the first page reports "has_next_page".
def self.run
  Logger.info text_to_ASCII.light_red
  Logger.info "Running instagram-crawler v#{InstagramCrawler::VERSION}\n"
  File.mkdir
  page_info, user_id = Parser::Html.new(Config.base_url).parsing
  Parser::Json.new(page_info, user_id).parsing if page_info["has_next_page"]
  Logger.info "\nSuccess, all files have been downloaded!".light_green
end
13
+
14
+ private
15
+
16
+ def self.text_to_ASCII
17
+ <<~USAGE.freeze
18
+ ___ _ ___ _
19
+ |_ _|_ _ __| |_ __ _ __ _ _ _ __ _ _ __ / __|_ _ __ ___ __ _| |___ _ _
20
+ | || ' \\(_-< _/ _` / _` | '_/ _` | ' \\ | (__| '_/ _` \\ V V / / -_) '_|
21
+ |___|_||_/__/\\__\\__,_\\__, |_| \\__,_|_|_|_| \\___|_| \\__,_|\\_/\\_/|_\\___|_|
22
+ |___/
23
+ USAGE
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,36 @@
module InstagramCrawler
  # Fans a single write out to several IO targets, for example STDOUT plus
  # a log file. File targets receive the text with terminal colour codes
  # removed; every other target receives the text unchanged.
  class MultiIO
    # Matches any ANSI SGR escape sequence (ESC [ <digits and semicolons> m).
    # This covers the colour and reset codes the crawler emits.
    ANSI_SGR = /\e\[[0-9;]*m/.freeze

    # targets - IO-like objects responding to #write and #close.
    def initialize(*targets)
      @targets = targets
    end

    # Writes +args+ to every target.
    #
    # The caller's string is never modified, so frozen strings are accepted
    # and the order of the targets does not change what each one receives.
    def write(args)
      text  = args.to_s
      plain = nil
      @targets.each do |target|
        if target.is_a?(::File)
          plain ||= pretty_file_text(text)
          target.write(plain)
        else
          target.write(text)
        end
      end
    end

    # Closes every target.
    def close
      @targets.each(&:close)
    end

    private

    # Returns a copy of +args+ with all ANSI SGR sequences stripped.
    def pretty_file_text(args)
      args.gsub(ANSI_SGR, '')
    end
  end
end
@@ -0,0 +1,43 @@
1
+ module InstagramCrawler
2
+ module Parser
3
+ class Args
4
+ attr_accessor :log
5
+
6
+ def initialize(args)
7
+ @args = args
8
+ parse_args
9
+ validates_required_args
10
+ end
11
+
12
+ private
13
+
# Declares the command-line options and consumes them from @args.
# Recognised values are stored on Config, or on this object for --log.
# --version and --help print their text and terminate the process.
def parse_args
  OptionParser.new do |parser|
    parser.banner = usage_msg
    parser.separator ''
    parser.separator 'options:'
    parser.on('-u', '--username USERNAME', 'Instagram username') do |value|
      Config.user_name = value
    end
    parser.on('-d', '--download', 'Download files') do
      Config.download = true
    end
    parser.on('-a', '--after DATE', 'Download files after this date (YYYYMMDD)') do |value|
      Config.after_date = value
    end
    parser.on('-l', '--log', 'Generate a log file in the current directory') do
      self.log = true
    end
    parser.on('-v', '--version', 'Show the instagram-crawler version') do
      puts("instagram-crawler #{InstagramCrawler::VERSION}")
      exit
    end
    parser.on('-h', '--help', 'Show this message') do
      puts(parser)
      exit
    end
  end.parse!(@args)
end
27
+
28
+ def usage_msg
29
+ <<~USAGE.freeze
30
+ Usage:
31
+ instagram-crawler [options]
32
+ See https://github.com/mgleon08/instagram-crawler for more information.
33
+ USAGE
34
+ end
35
+
# Ensures a username was supplied; raises Errors::ArgError otherwise.
def validates_required_args
  return unless Config.user_name.nil?

  raise InstagramCrawler::Errors::ArgError.new('-u or --username')
end
41
+ end
42
+ end
43
+ end
@@ -0,0 +1,30 @@
module InstagramCrawler
  module Parser
    # Private helpers shared by the HTML and JSON parsers.
    class Base
      private

      # Logs and downloads every entry of a multi-item post. Files are named
      # "<position>.<time>" (position starts at 1) inside "post/<time>".
      def parse_post(posts, time)
        posts.each_with_index do |post, offset|
          position = offset + 1
          url = post["node"]["display_url"]
          output(time, url)
          File.download(url, "post/#{time}", "#{position}.#{time}")
        end
      end

      # Logs one line: the bracketed time in light cyan, then the URL.
      def output(time, url)
        stamp = "[#{time}]".light_cyan
        Logger.info "#{stamp} #{url}"
      end

      # Formats epoch seconds +ts+ as "YYYY-MM-DDTHH:MM".
      def parse_to_date(ts)
        moment = Time.at(ts)
        moment.strftime('%Y-%m-%dT%H:%M')
      end

      # Terminates the process with a success message once +time+ (epoch
      # seconds) is older than the configured cut-off date. Does nothing
      # when no cut-off date is configured.
      def check_time(time)
        return unless Config.after_date
        return unless Config.parse_date > time

        Logger.info "\nSuccess, the files after #{Config.after_date} have been downloaded!".light_green
        exit
      end
    end
  end
end
+ end
@@ -0,0 +1,79 @@
require "json" # JSON.parse is used below; do not depend on another gem having loaded it

module InstagramCrawler
  module Parser
    # Fetches one Instagram HTML page (profile, photo post or video post)
    # and extracts media information from it.
    class Html < Base
      attr_reader :html

      # Downloads the page at +url+.
      # Raises Errors::HttpError when the response status is not 200.
      def initialize(url)
        @html = get_html(url)
      end

      # Handles a profile page: processes every post embedded in it and
      # returns [page_info, user_id] for the follow-up pagination requests.
      def parsing
        profile  = shared_data["entry_data"]["ProfilePage"][0]
        timeline = profile["graphql"]["user"]["edge_owner_to_timeline_media"]
        # Presumably "logging_page_id" has the form "profilePage_<id>".
        # delete_prefix removes exactly that prefix, whereas String#delete
        # treats its argument as a set of characters to remove anywhere.
        user_id = profile["logging_page_id"].delete_prefix("profilePage_")

        loop_edges(timeline["edges"])

        return timeline["page_info"], user_id
      end

      # Returns the video URL from the page's og:video meta tag.
      def parsing_video_page
        doc = Nokogiri::HTML(html)
        meta_v = doc.at_xpath("//meta[@property='og:video']")
        # Read the value by attribute name rather than by attribute position.
        meta_v["content"]
      end

      # Returns the child edges (Array) for a multi-item post, or the
      # display URL (String) for a single photo.
      def parsing_photo_page
        shortcode_media = shared_data["entry_data"]["PostPage"][0]["graphql"]["shortcode_media"]
        sidecar = shortcode_media["edge_sidecar_to_children"]
        sidecar ? sidecar["edges"] : shortcode_media["display_url"]
      end

      private

      # Parses the JSON found in the inline script that mentions
      # window._sharedData.
      # Raises Errors::HttpError when the page contains no such script.
      def shared_data
        doc = Nokogiri::HTML(html)
        js_data = doc.at_xpath("//script[contains(text(),'window._sharedData')]")
        raise Errors::HttpError, "window._sharedData not found in page" if js_data.nil?
        # Skips the first 21 characters and the last one; presumably the
        # "window._sharedData = " prefix and the trailing ";".
        JSON.parse(js_data.text[21..-2])
      end

      # Logs and downloads each timeline entry. Every entry needs one more
      # page fetch to resolve its media URL(s).
      def loop_edges(edges)
        edges.each do |edge|
          node = edge["node"]
          check_time(node["taken_at_timestamp"])
          time = parse_to_date(node["taken_at_timestamp"])
          page_url = "https://www.instagram.com/p/#{node["shortcode"]}/"

          if node["is_video"]
            Logger.info "========VIDEO========".light_yellow
            url = Html.new(page_url).parsing_video_page
            output(time, url)
            File.download(url, 'video', time)
          else
            shortcode_media = Html.new(page_url).parsing_photo_page
            if shortcode_media.is_a? Array
              Logger.info "========POST========".light_magenta
              parse_post(shortcode_media, time)
            else
              Logger.info "========PHOTO========".light_green
              url = shortcode_media
              output(time, url)
              File.download(url, 'photo', time)
            end
          end
        end
      end

      # Returns the body of a GET request to +url+ as a String.
      def get_html(url)
        res = HTTP.get(url)
        raise Errors::HttpError, "#{res.code} #{res.reason}" if res.code != 200
        res.to_s
      end
    end
  end
end
@@ -0,0 +1,60 @@
require "json"
require "uri"

module InstagramCrawler
  module Parser
    # Walks the remaining timeline pages through Instagram's GraphQL
    # endpoint, starting from the page_info returned for the profile page.
    class Json < Base
      attr_reader :page_info, :user_id

      # page_info - Hash with "end_cursor" and "has_next_page".
      # user_id   - String account id placed in the query variables.
      def initialize(page_info, user_id)
        @page_info = page_info
        @user_id   = user_id
      end

      # Requests page after page, processing the entries of each one, until
      # the response reports no further page. Always performs at least one
      # request.
      def parsing
        loop do
          url  = next_url(page_info["end_cursor"], user_id)
          json = JSON.parse(get_json(url))
          timeline = json["data"]["user"]["edge_owner_to_timeline_media"]
          @page_info = timeline["page_info"]

          loop_edges(timeline["edges"])
          break unless page_info["has_next_page"]
        end
      end

      private

      # Logs and downloads each timeline entry using the URLs contained in
      # the JSON response itself.
      def loop_edges(edges)
        edges.each do |edge|
          node = edge["node"]
          check_time(node["taken_at_timestamp"])
          time = parse_to_date(node["taken_at_timestamp"])

          if node["is_video"]
            Logger.info "========VIDEO========".light_yellow
            url = node["video_url"]
            output(time, url)
            File.download(url, 'video', time)
          elsif !node["edge_sidecar_to_children"].nil?
            Logger.info "========POST========".light_magenta
            parse_post(node["edge_sidecar_to_children"]["edges"], time)
          else
            Logger.info "========PHOTO========".light_green
            url = node["display_url"]
            output(time, url)
            File.download(url, 'photo', time)
          end
        end
      end

      # Returns the body of a GET request to +url+, sent with the
      # `sessionid` cookie taken from the environment.
      # Raises Errors::HttpError when the response status is not 200.
      def get_json(url)
        res = HTTP.cookies(sessionid: ENV["sessionid"]).get(url)
        raise Errors::HttpError, "#{res.code} #{res.reason}" if res.code != 200
        res.to_s
      end

      # Builds the GraphQL URL for the page that follows +end_cursor+.
      #
      # The variables are serialised as JSON and percent-encoded as a whole,
      # so the cursor is passed through intact whatever characters it holds
      # ("=", "+", "/", ...) and need not end in "==".
      def next_url(end_cursor, user_id)
        variables = JSON.generate(id: user_id, first: 12, after: end_cursor)
        "https://www.instagram.com/graphql/query/?query_hash=f412a8bfd8332a76950fefc1da5785ef&variables=#{URI.encode_www_form_component(variables)}"
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
+ module InstagramCrawler
2
+ VERSION = "0.1.1".freeze
3
+ end
Binary file
Binary file
metadata ADDED
@@ -0,0 +1,162 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: instagram-crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Leon Ji
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-11-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.17'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.17'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: nokogiri
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.8'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.8'
69
+ - !ruby/object:Gem::Dependency
70
+ name: http
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '4.0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '4.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: colorize
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '0.8'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.8'
97
+ description: Crawl instagram photos, posts and videos for download.
98
+ email:
99
+ - mgleon08@gmail.com
100
+ executables:
101
+ - instagram-crawler
102
+ extensions: []
103
+ extra_rdoc_files: []
104
+ files:
105
+ - ".gitignore"
106
+ - ".rspec"
107
+ - ".travis.yml"
108
+ - Gemfile
109
+ - Gemfile.lock
110
+ - LICENSE.txt
111
+ - README.md
112
+ - Rakefile
113
+ - bin/console
114
+ - bin/instagram-crawler
115
+ - bin/setup
116
+ - instagram-crawler.gemspec
117
+ - lib/instagram_crawler.rb
118
+ - lib/instagram_crawler/config.rb
119
+ - lib/instagram_crawler/errors/arg_error.rb
120
+ - lib/instagram_crawler/errors/env_error.rb
121
+ - lib/instagram_crawler/errors/http_error.rb
122
+ - lib/instagram_crawler/file.rb
123
+ - lib/instagram_crawler/logger.rb
124
+ - lib/instagram_crawler/main.rb
125
+ - lib/instagram_crawler/multi_io.rb
126
+ - lib/instagram_crawler/parser/args.rb
127
+ - lib/instagram_crawler/parser/base.rb
128
+ - lib/instagram_crawler/parser/html.rb
129
+ - lib/instagram_crawler/parser/json.rb
130
+ - lib/instagram_crawler/version.rb
131
+ - screenshots/instagram_crawler_demo.gif
132
+ - screenshots/logo.png
133
+ - screenshots/sessionid.png
134
+ homepage: https://github.com/mgleon08/instagram-crawler
135
+ licenses:
136
+ - MIT
137
+ metadata:
138
+ allowed_push_host: https://rubygems.org/
139
+ homepage_uri: https://github.com/mgleon08/instagram-crawler
140
+ source_code_uri: https://github.com/mgleon08/instagram-crawler
141
+ changelog_uri: https://github.com/mgleon08/instagram-crawler
142
+ post_install_message:
143
+ rdoc_options: []
144
+ require_paths:
145
+ - lib
146
+ required_ruby_version: !ruby/object:Gem::Requirement
147
+ requirements:
148
+ - - ">="
149
+ - !ruby/object:Gem::Version
150
+ version: 2.5.1
151
+ required_rubygems_version: !ruby/object:Gem::Requirement
152
+ requirements:
153
+ - - ">="
154
+ - !ruby/object:Gem::Version
155
+ version: '0'
156
+ requirements: []
157
+ rubyforge_project:
158
+ rubygems_version: 2.7.6
159
+ signing_key:
160
+ specification_version: 4
161
+ summary: The easiest way to download instagram photos, posts and videos.
162
+ test_files: []