zorki 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/main.yml +18 -0
- data/.gitignore +12 -0
- data/.rubocop.yml +67 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +84 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +162 -0
- data/LICENSE.txt +21 -0
- data/README.md +85 -0
- data/Rakefile +16 -0
- data/bin/console +15 -0
- data/bin/setup +8 -0
- data/lib/generators/zorki.rb +3 -0
- data/lib/generators/zorki_generator.rb +6 -0
- data/lib/helpers/configuration.rb +28 -0
- data/lib/zorki/monkeypatch.rb +50 -0
- data/lib/zorki/post.rb +46 -0
- data/lib/zorki/scrapers/post_scraper.rb +125 -0
- data/lib/zorki/scrapers/scraper.rb +227 -0
- data/lib/zorki/scrapers/user_scraper.rb +74 -0
- data/lib/zorki/user.rb +52 -0
- data/lib/zorki/version.rb +5 -0
- data/lib/zorki.rb +74 -0
- data/zorki.gemspec +43 -0
- data/zorki.logs +300 -0
- metadata +152 -0
data/lib/zorki/scrapers/post_scraper.rb
ADDED
@@ -0,0 +1,125 @@

```ruby
# frozen_string_literal: true

require "typhoeus"

module Zorki
  class PostScraper < Scraper
    def parse(id)
      # Stuff we need to get from the DOM (implemented is starred):
      # - User *
      # - Text *
      # - Image * / Images * / Video *
      # - Date *
      # - Number of likes *
      # - Hashtags

      Capybara.app_host = "https://instagram.com"

      # video slideshows https://www.instagram.com/p/CY7KxwYOFBS/?utm_source=ig_embed&utm_campaign=loading
      login
      graphql_object = get_content_of_subpage_from_url(
        "https://www.instagram.com/p/#{id}/",
        "/graphql",
        "data,xdt_api__v1__media__shortcode__web_info,items"
      )

      # For pages that have been marked misinfo the structure is very different than not.
      # If it is a clean post then it's just a schema.org thing, but if it's misinfo it's the old
      # way of deeply nested stuff.
      #
      # First we check which one we're getting.

      if graphql_object.has_key?("articleBody")
        # Let's just parse the images first
        images = graphql_object["image"].map do |image|
          Zorki.retrieve_media(image["url"])
        end

        text = graphql_object["articleBody"]
        username = graphql_object["author"]["identifier"]["value"]
        # 2021-04-01T17:07:10-07:00
        date = DateTime.strptime(graphql_object["dateCreated"], "%Y-%m-%dT%H:%M:%S%z")
        interactions = graphql_object["interactionStatistic"]
        number_of_likes = interactions.select do |x|
          x["interactionType"] == "http://schema.org/LikeAction"
        end.first["userInteractionCount"]

        unless graphql_object["video"].empty?
          video = graphql_object["video"].first["contentUrl"]
          video_preview_image = graphql_object["video"].first["thumbnailUrl"]
        end
      else
        # We need to see if this is a single image post or a slideshow. We do that
        # by looking for a single image; if it's not there, we assume the alternative.
        graphql_object = graphql_object["data"]["xdt_api__v1__media__shortcode__web_info"]

        unless graphql_object["items"][0].has_key?("video_versions") && !graphql_object["items"][0]["video_versions"].nil?
          # Check if there is a slideshow or not
          unless graphql_object["items"][0].has_key?("carousel_media") && !graphql_object["items"][0]["carousel_media"].nil?
            # Single image
            image_url = graphql_object["items"][0]["image_versions2"]["candidates"][0]["url"]
            images = [Zorki.retrieve_media(image_url)]
          else
            # Slideshow
            images = graphql_object["items"][0]["carousel_media"].map do |media|
              Zorki.retrieve_media(media["image_versions2"]["candidates"][0]["url"])
            end
          end
        else
          # Some of these I've seen in both ways, thus the commented-out lines
          # video_url = graphql_object["entry_data"]["PostPage"].first["graphql"]["shortcode_media"]["video_url"]
          video_url = graphql_object["items"][0]["video_versions"][0]["url"]
          video = Zorki.retrieve_media(video_url)
          # video_preview_image_url = graphql_object["entry_data"]["PostPage"].first["graphql"]["shortcode_media"]["display_resources"].last["src"]
          video_preview_image_url = graphql_object["items"][0]["image_versions2"]["candidates"][0]["url"]
          video_preview_image = Zorki.retrieve_media(video_preview_image_url)
        end

        unless graphql_object["items"][0]["caption"].nil?
          text = graphql_object["items"][0]["caption"]["text"]
        else
          text = ""
        end

        username = graphql_object["items"][0]["user"]["username"]

        date = DateTime.strptime(graphql_object["items"][0]["taken_at"].to_s, "%s")
        number_of_likes = graphql_object["items"][0]["like_count"]
      end

      screenshot_file = take_screenshot

      # This has to run last since it switches pages
      user = User.lookup([username]).first
      page.quit

      {
        images: images,
        video: video,
        video_preview_image: video_preview_image,
        screenshot_file: screenshot_file,
        text: text,
        date: date,
        number_of_likes: number_of_likes,
        user: user,
        id: id
      }
    end

    def take_screenshot
      # First check if a post has a fact check overlay; if so, clear it.
      # The only issue is that this can take *awhile* to search. Not sure what to do about that
      # since it's Instagram's fault for having such a fucked up obfuscated hierarchy.
      begin
        find_button("See Post").click
        sleep(0.1)
      rescue Capybara::ElementNotFound
        # Do nothing if the element is not found
      end

      # Take the screenshot and return it
      save_screenshot("#{Zorki.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
    end
  end
end
```
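For orientation, `parse` is normally reached through `Zorki::Post` (`data/lib/zorki/post.rb`, whose diff is not shown here), but the scraper can be driven directly. A minimal sketch, assuming valid `INSTAGRAM_USER_NAME`/`INSTAGRAM_PASSWORD` environment variables; the shortcode is the example from the comment in the code above:

```ruby
require "zorki"

# Any public post's /p/<shortcode>/ segment works the same way.
post_attributes = Zorki::PostScraper.new.parse("CY7KxwYOFBS")

post_attributes[:text]          # caption text, "" if the post has none
post_attributes[:images]        # array of temp-file paths from Zorki.retrieve_media
post_attributes[:date]          # DateTime the post was taken
post_attributes[:user].username # the Zorki::User who posted it
```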
data/lib/zorki/scrapers/scraper.rb
ADDED
@@ -0,0 +1,227 @@

```ruby
# frozen_string_literal: true

require "capybara/dsl"
require "dotenv/load"
require "oj"
require "selenium-webdriver"
require "logger"
require "debug"
require "securerandom"

# 2022-06-07 14:15:23 WARN Selenium [DEPRECATION] [:browser_options] :options as a parameter for driver initialization is deprecated. Use :capabilities with an Array of value capabilities/options if necessary instead.

options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
options.add_argument("--start-maximized")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--disable-extensions")
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
options.add_preference "password_manager_enabled", false
options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")

Capybara.register_driver :selenium_zorki do |app|
  client = Selenium::WebDriver::Remote::Http::Default.new
  client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
  Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
end

Capybara.threadsafe = true
Capybara.default_max_wait_time = 60
Capybara.reuse_server = true

module Zorki
  class Scraper # rubocop:disable Metrics/ClassLength
    include Capybara::DSL

    @@logger = Logger.new(STDOUT)
    @@logger.level = Logger::WARN
    @@logger.datetime_format = "%Y-%m-%d %H:%M:%S"
    @@session_id = nil

    def initialize
      Capybara.default_driver = :selenium_zorki
    end

    # Instagram uses GraphQL (like most of Facebook, I think), and returns an object that is
    # actually used to seed the page. We can just parse this for most things.
    #
    # additional_search_parameters is a comma-separated list of keys to walk down, e.g.
    # `data,xdt_api__v1__media__shortcode__web_info,items`
    #
    # @returns Hash a Ruby hash of the JSON data
    def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
      # So this is fun:
      # For pages marked as misinformation we have to use one method (interception of requests) and
      # for pages that are not, we can just pull the data straight from the page.
      #
      # How do we figure out which is which?... for now we'll just run through both and see where we
      # go with it.

      # Our user data no longer lives in the graphql object passed initially with the page.
      # Instead it comes in as part of a subsequent call. This intercepts all calls, checks if it's
      # the one we want, and then moves on.
      response_body = nil

      page.driver.browser.intercept do |request, &continue|
        # This passes the request forward unmodified, since we only care about the response
        # puts "checking request: #{request.url}"

        continue.call(request) && next unless request.url.include?(subpage_search)

        continue.call(request) do |response|
          # Check if not a CORS prefetch and finish up if not
          if response.body.present?
            check_passed = true

            unless additional_search_parameters.nil?
              body_to_check = Oj.load(response.body)

              search_parameters = additional_search_parameters.split(",")
              search_parameters.each do |key|
                break if body_to_check.nil?

                check_passed = false unless body_to_check.has_key?(key)
                body_to_check = body_to_check[key]
              end
            end

            response_body = response.body if check_passed == true
          end
        end
      rescue Selenium::WebDriver::Error::WebDriverError
        # Eat them
      end

      # Now that the intercept is set up, we visit the page we want
      visit(url)
      # We wait until the correct intercept is processed or we've waited 60 seconds
      start_time = Time.now
      # puts "Waiting.... #{url}"

      sleep(rand(1...10))
      while response_body.nil? && (Time.now - start_time) < 60
        sleep(0.1)
      end

      page.driver.execute_script("window.stop();")

      # If this is a page that has not been marked as misinfo we can just pull the data
      # TODO: put this before the whole load loop
      if response_body.nil?
        doc = Nokogiri::HTML(page.driver.browser.page_source)
        elements = doc.search("script").find_all do |e|
          e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
        end

        raise ContentUnavailableError if elements&.empty?
        return Oj.load(elements.first.text)
      end

      raise ContentUnavailableError if response_body.nil?
      Oj.load(response_body)
    end

    private

    ##########
    # Set the session to use a new user folder in the options!
    ##########
    def reset_selenium
      options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
      options.add_argument("--start-maximized")
      options.add_argument("--no-sandbox")
      options.add_argument("--disable-dev-shm-usage")
      options.add_argument("--disable-blink-features=AutomationControlled")
      options.add_argument("--disable-extensions")
      options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
      options.add_preference "password_manager_enabled", false
      options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
      # options.add_argument("--user-data-dir=/tmp/tarun")

      Capybara.register_driver :selenium do |app|
        client = Selenium::WebDriver::Remote::Http::Default.new
        client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
        Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
      end

      Capybara.current_driver = :selenium
    end

    def login
      # Reset the sessions so that there's nothing lying around
      page.quit

      # Check if we're on an Instagram page already; if not, visit it.
      visit("https://instagram.com") unless page.driver.browser.current_url.include? "instagram.com"

      # We don't have to log in if we already are
      begin
        return if find_field("Search", wait: 10).present?
      rescue Capybara::ElementNotFound; end

      # Check if we're redirected to a login page; if we aren't, we're already logged in
      return unless page.has_xpath?('//*[@id="loginForm"]/div/div[3]/button')

      # Try to log in
      loop_count = 0
      while loop_count < 5
        fill_in("username", with: ENV["INSTAGRAM_USER_NAME"])
        fill_in("password", with: ENV["INSTAGRAM_PASSWORD"])

        begin
          click_button("Log in", exact_text: true) # Note: a looser match can hit "Log in with Facebook", which redirects to Facebook's login page
        rescue Capybara::ElementNotFound; end # If we can't find it don't break horribly, just keep waiting

        break unless has_css?('p[data-testid="login-error-message"]', wait: 10)
        loop_count += 1
        sleep(rand * 10.3)
      end

      # Sometimes Instagram just... doesn't let you log in
      raise "Instagram not accessible" if loop_count == 5

      # No, we don't want to save our login credentials
      begin
        click_on("Save Info")
      rescue Capybara::ElementNotFound; end
    end

    def fetch_image(url)
      request = Typhoeus::Request.new(url, followlocation: true)
      request.on_complete do |response|
        if response.success?
          return response.body
        elsif response.timed_out?
          raise Zorki::Error, "Fetching image at #{url} timed out"
        else
          raise Zorki::Error, "Fetching image at #{url} returned non-successful HTTP server response #{response.code}"
        end
      end
      # Actually fire the request; on_complete only registers the callback
      request.run
    end

    # Convert a string such as "1,234" or "1.5m" to an integer
    def number_string_to_integer(number_string)
      # First we have to remove any commas in the number or else it all breaks
      number_string = number_string.delete(",")
      # Is the last digit not a number? If so, we're going to have to multiply it by some multiplier
      should_expand = /[0-9]/.match(number_string[-1, 1]).nil?

      # Get the last index and remove the letter at the end if we should expand
      last_index = should_expand ? number_string.length - 1 : number_string.length
      number = number_string[0, last_index].to_f
      multiplier = 1
      # Determine the multiplier depending on the letter indicated
      case number_string[-1, 1]
      when "m"
        multiplier = 1_000_000
      end

      # Multiply everything and ensure we get an integer back
      (number * multiplier).to_i
    end
  end
end

require_relative "post_scraper"
require_relative "user_scraper"
```
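The comma-separated `additional_search_parameters` act as a key path: an intercepted JSON body is accepted only if every key exists, nested in order. A standalone sketch of that walk, using a made-up payload shaped like the one `PostScraper` waits for:

```ruby
require "oj"

# Hypothetical response body; the real one comes from the intercepted /graphql call.
body = Oj.load('{"data":{"xdt_api__v1__media__shortcode__web_info":{"items":[{"like_count":42}]}}}')

check_passed = true
"data,xdt_api__v1__media__shortcode__web_info,items".split(",").each do |key|
  break if body.nil?

  check_passed = false unless body.has_key?(key)
  body = body[key]
end

check_passed # => true, so this response body would be captured
```

Note also that `number_string_to_integer` only recognizes an `m` suffix: `"1.5m"` becomes `1_500_000` and `"2,345"` becomes `2345`, but `"12.3k"` falls through with a multiplier of 1 and comes back as `12`.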
data/lib/zorki/scrapers/user_scraper.rb
ADDED
@@ -0,0 +1,74 @@

```ruby
# frozen_string_literal: true

require "typhoeus"

module Zorki
  class UserScraper < Scraper
    def parse(username)
      # Stuff we need to get from the DOM (implemented is starred):
      # - *Name
      # - *Username
      # - *No. of posts
      # - *Verified
      # - *No. of followers
      # - *No. of people they follow
      # - *Profile
      # - *description
      # - *links
      # - *Profile image
      login

      graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "?username=")

      if graphql_script.has_key?("author") && !graphql_script["author"].nil?
        user = graphql_script["author"]

        # Get the username (to verify we're on the right page here)
        scraped_username = user["identifier"]["value"]
        raise Zorki::Error unless username == scraped_username

        number_of_posts = graphql_script["interactionStatistic"].select do |stat|
          stat["interactionType"] == "https://schema.org/FilmAction"
        end.first

        number_of_followers = graphql_script["interactionStatistic"].select do |stat|
          stat["interactionType"] == "http://schema.org/FollowAction"
        end.first

        profile_image_url = user["image"]
        {
          name: user["name"],
          username: username,
          number_of_posts: Integer(number_of_posts["userInteractionCount"]),
          number_of_followers: Integer(number_of_followers["userInteractionCount"]),
          # number_of_following: user["edge_follow"]["count"],
          verified: user["is_verified"], # todo
          profile: graphql_script["description"],
          profile_link: user["sameAs"],
          profile_image: Zorki.retrieve_media(profile_image_url),
          profile_image_url: profile_image_url
        }
      else
        user = graphql_script["data"]["user"]

        # Get the username (to verify we're on the right page here)
        scraped_username = user["username"]
        raise Zorki::Error unless username == scraped_username

        profile_image_url = user["profile_pic_url_hd"]
        {
          name: user["full_name"],
          username: username,
          number_of_posts: user["edge_owner_to_timeline_media"]["count"],
          number_of_followers: user["edge_followed_by"]["count"],
          number_of_following: user["edge_follow"]["count"],
          verified: user["is_verified"],
          profile: user["biography"],
          profile_link: user["external_url"],
          profile_image: Zorki.retrieve_media(profile_image_url),
          profile_image_url: profile_image_url
        }
      end
    end
  end
end
```
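The schema.org branch pulls its counts out of the `interactionStatistic` array. A toy illustration of that `select`, with a made-up stats array matching the shapes the code compares against (the mixed `https`/`http` schemes mirror the code above):

```ruby
stats = [
  { "interactionType" => "https://schema.org/FilmAction",  "userInteractionCount" => "204" },
  { "interactionType" => "http://schema.org/FollowAction", "userInteractionCount" => "1234" }
]

followers = stats.select { |stat| stat["interactionType"] == "http://schema.org/FollowAction" }.first
Integer(followers["userInteractionCount"]) # => 1234
```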
data/lib/zorki/user.rb
ADDED
@@ -0,0 +1,52 @@

```ruby
# frozen_string_literal: true

module Zorki
  class User
    def self.lookup(usernames = [])
      # If a single id is passed in we make it the appropriate array
      usernames = [usernames] unless usernames.kind_of?(Array)

      # Check that the usernames are at least real usernames
      # usernames.each { |id| raise Birdsong::Error if !/\A\d+\z/.match(id) }

      self.scrape(usernames)
    end

    attr_reader :name,
                :username,
                :number_of_posts,
                :number_of_followers,
                :number_of_following,
                :verified,
                :profile,
                :profile_link,
                :profile_image,
                :profile_image_url

    private

    def initialize(user_hash = {})
      @name = user_hash[:name]
      @username = user_hash[:username]
      @number_of_posts = user_hash[:number_of_posts]
      @number_of_followers = user_hash[:number_of_followers]
      @number_of_following = user_hash[:number_of_following]
      @verified = user_hash[:verified]
      @profile = user_hash[:profile]
      @profile_link = user_hash[:profile_link]
      @profile_image = user_hash[:profile_image]
      @profile_image_url = user_hash[:profile_image_url]
    end

    class << self
      private

      def scrape(usernames)
        usernames.map do |username|
          user_hash = Zorki::UserScraper.new.parse(username)
          User.new(user_hash)
        end
      end
    end
  end
end
```
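`lookup` is the public entry point; `scrape` is private, so callers always receive wrapped `Zorki::User` objects. A minimal usage sketch, assuming valid Instagram credentials in the environment ("instagram" is just an example handle):

```ruby
require "zorki"

users = Zorki::User.lookup(["instagram"]) # a bare string would be wrapped into an array
user = users.first

user.username            # => "instagram"
user.number_of_followers # follower count as an Integer
user.profile_image       # path to the avatar downloaded into Zorki.temp_storage_location
```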
data/lib/zorki.rb
ADDED
@@ -0,0 +1,74 @@

```ruby
# frozen_string_literal: true

require_relative "zorki/version"
require_relative "zorki/monkeypatch"

# Representative objects we create
require_relative "zorki/user"
require_relative "zorki/post"

require "helpers/configuration"
require_relative "zorki/scrapers/scraper"

module Zorki
  extend Configuration

  class Error < StandardError
    def initialize(msg = "Zorki encountered an error scraping Instagram")
      super
    end
  end

  class ContentUnavailableError < Error
    def initialize(msg = "Zorki could not find content requested")
      super
    end
  end

  class RetryableError < Error; end

  class ImageRequestTimedOutError < RetryableError
    def initialize(msg = "Zorki encountered a timeout error requesting an image")
      super
    end
  end

  class ImageRequestFailedError < RetryableError
    def initialize(msg = "Zorki received a non-200 response requesting an image")
      super
    end
  end

  define_setting :temp_storage_location, "tmp/zorki"

  # Get an image from a URL and save to a temp folder set in the configuration under
  # temp_storage_location
  def self.retrieve_media(url)
    response = Typhoeus.get(url)

    # Get the file extension if it's in the file
    stripped_url = url.split("?").first # remove URL query params
    extension = stripped_url.split(".").last

    # Do some basic checks so we just empty out if there's something weird in the file extension
    # that could do some harm.
    if extension.length.positive?
      extension = nil unless /^[a-zA-Z0-9]+$/.match?(extension)
      extension = ".#{extension}" unless extension.nil?
    end

    temp_file_name = "#{Zorki.temp_storage_location}/instagram_media_#{SecureRandom.uuid}#{extension}"

    # We do this in case the folder isn't created yet, since it's a temp folder we'll just do so
    self.create_temp_storage_location
    File.binwrite(temp_file_name, response.body)
    temp_file_name
  end

  private

  def self.create_temp_storage_location
    return if File.exist?(Zorki.temp_storage_location) && File.directory?(Zorki.temp_storage_location)
    FileUtils.mkdir_p Zorki.temp_storage_location
  end
end
```
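`define_setting` comes from `helpers/configuration.rb`, which is not shown in this diff; assuming it generates a module-level reader/writer pair (the usual pattern for such helpers), overriding the temp folder and fetching a file would look roughly like this sketch:

```ruby
require "zorki"

# Assumes define_setting exposes a writer; the shipped default is "tmp/zorki".
Zorki.temp_storage_location = "/tmp/zorki_downloads"

# Hypothetical URL; query params are stripped before the extension check, so
# this saves to /tmp/zorki_downloads/instagram_media_<uuid>.jpg and returns that path.
path = Zorki.retrieve_media("https://example.com/some_image.jpg?cache=bust")
```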
data/zorki.gemspec
ADDED
@@ -0,0 +1,43 @@

```ruby
# frozen_string_literal: true

require_relative "lib/zorki/version"

Gem::Specification.new do |spec|
  spec.name = "zorki"
  spec.version = Zorki::VERSION
  spec.authors = ["Christopher Guess"]
  spec.email = ["cguess@gmail.com"]

  spec.summary = "A gem to scrape Instagram pages for archive purposes."
  # spec.description = "TODO: Write a longer description or delete this line."
  # spec.homepage = "TODO: Put your gem's website or public repo URL here."
  spec.license = "MIT"
  spec.required_ruby_version = Gem::Requirement.new(">= 2.7.0")

  # spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"

  # spec.metadata["homepage_uri"] = spec.homepage
  # spec.metadata["source_code_uri"] = "TODO: Put your gem's public repo URL here."
  # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Uncomment to register a new dependency of your gem
  # spec.add_dependency "example-gem", "~> 1.0"
  spec.add_dependency "capybara" # For scraping and running browsers
  spec.add_dependency "apparition" # A Chrome driver for Capybara
  spec.add_dependency "typhoeus" # For making API requests
  spec.add_dependency "oj" # A faster JSON parser/loader than stdlib
  spec.add_dependency "selenium-webdriver" # Drives the browser via Selenium
  spec.add_dependency "selenium-devtools" # Allows us to intercept requests

  # For more information and examples about making a new gem, check out our
  # guide at: https://bundler.io/guides/creating_gem.html
end
```