birdsong 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/birdsong/monkeypatch.rb +52 -0
- data/lib/birdsong/scrapers/scraper.rb +223 -0
- data/lib/birdsong/scrapers/tweet_scraper.rb +112 -0
- data/lib/birdsong/version.rb +1 -1
- data/lib/birdsong.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e225de345219a482d98dc634601169cb2ab42c78cc9105574fb426f34d334980
+  data.tar.gz: 39f26a882f4e5939012fef4f64b30dabe5b28983a8d21fee6723b75a7342f889
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cac7812476ce19901ac91d1701bf8d01772156d1cf1e9e64d4cd17e9a47d1e8cf60e5f8c05b97c387576f0a910a89aeba1aaca9753764107b4cda6f11de97efe
+  data.tar.gz: c50a4320302b0f87ae09be8b445faa52b7935e4177ffa3804de1ade10faad1c4377590750ef3c369d57d977f0e45022db6898dd7a3901426f0cadcfd0007a4e4
data/lib/birdsong/monkeypatch.rb
ADDED
@@ -0,0 +1,52 @@
+require "logger"
+require "selenium-webdriver"
+
+# Design taken from https://blog.appsignal.com/2021/08/24/responsible-monkeypatching-in-ruby.html
+
+module SeleniumMonkeypatch
+  class << self
+    @@logger = Logger.new(STDOUT)
+    @@logger.level = Logger::INFO
+
+    def apply_patch
+      target_class = find_class
+      target_method = find_method(target_class)
+
+      unless target_method
+        raise "Could not find class or method when patching Selenium::WebDriver::DevTools.send_cmd"
+      end
+
+      @@logger.info "#{__FILE__} is monkeypatching Selenium::WebDriver::DevTools.send_cmd"
+      target_class.prepend(InstanceMethods)
+    end
+
+    private
+
+    def find_class
+      Kernel.const_get("Selenium::WebDriver::DevTools")
+    rescue NameError
+    end
+
+    def find_method(class_)
+      return unless class_
+      class_.instance_method(:send_cmd)
+    rescue NameError
+    end
+  end
+
+  module InstanceMethods
+    # We're monkeypatching the following method so that Selenium doesn't raise errors when we fail to call `continue` on requests
+    def send_cmd(method, **params)
+      data = { method: method, params: params.compact }
+      data[:sessionId] = @session_id if @session_id
+      message = @ws.send_cmd(**data)
+      if message.nil? == false && message["error"] && (method != "Fetch.continueRequest")
+        raise Birdsong::Error::WebDriverError, error_message(message["error"])
+      end
+
+      message
+    end
+  end
+end
+
+SeleniumMonkeypatch.apply_patch
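The patch works through Module#prepend: once InstanceMethods is prepended, its send_cmd sits ahead of the original Selenium::WebDriver::DevTools#send_cmd in the ancestor chain, so it is the version that runs, and it declines to raise when a Fetch.continueRequest call comes back with an error. Below is a minimal standalone sketch of the same prepend pattern; the Greeter and LoudGreeting names are invented for illustration and are not part of the gem.

# Illustration only: the prepend-based patching used above, shown outside Selenium.
class Greeter
  def greet(name)
    "Hello, #{name}"
  end
end

module LoudGreeting
  # Because the module is prepended, this method is found before Greeter#greet,
  # just as InstanceMethods#send_cmd is found before DevTools#send_cmd above.
  def greet(name)
    "#{super}!"
  end
end

Greeter.prepend(LoudGreeting)

puts Greeter.new.greet("world")  # => "Hello, world!"
p Greeter.ancestors.first(2)     # => [LoudGreeting, Greeter]

Unlike this sketch, the gem's replacement never calls super; it re-implements send_cmd wholesale so it can inspect the DevTools response before deciding whether to raise.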
data/lib/birdsong/scrapers/scraper.rb
ADDED
@@ -0,0 +1,223 @@
+# frozen_string_literal: true
+
+require "capybara/dsl"
+require "dotenv/load"
+require "oj"
+require "selenium-webdriver"
+require "logger"
+require "securerandom"
+require "selenium/webdriver/remote/http/curb"
+require "debug"
+
+# 2022-06-07 14:15:23 WARN Selenium [DEPRECATION] [:browser_options] :options as a parameter for driver initialization is deprecated. Use :capabilities with an Array of value capabilities/options if necessary instead.
+
+options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
+options.add_argument("--start-maximized")
+options.add_argument("--no-sandbox")
+options.add_argument("--disable-dev-shm-usage")
+options.add_argument("--disable-blink-features=AutomationControlled")
+options.add_argument("--disable-extensions")
+options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
+options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
+options.add_preference "password_manager_enabled", false
+options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
+
+Capybara.register_driver :selenium_birdsong do |app|
+  client = Selenium::WebDriver::Remote::Http::Curb.new
+  # client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
+  Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
+end
+
+Capybara.threadsafe = true
+Capybara.default_max_wait_time = 60
+Capybara.reuse_server = true
+
+module Birdsong
+  class Scraper # rubocop:disable Metrics/ClassLength
+    include Capybara::DSL
+
+    @@logger = Logger.new(STDOUT)
+    @@logger.level = Logger::WARN
+    @@logger.datetime_format = "%Y-%m-%d %H:%M:%S"
+    @@session_id = nil
+
+    def initialize
+      Capybara.default_driver = :selenium_birdsong
+    end
+
+    # Instagram uses GraphQL (like most of Facebook I think), and returns an object that actually
+    # is used to seed the page. We can just parse this for most things.
+    #
+    # additional_search_parameters is a comma-separated list of keys
+    # example: `data,xdt_api__v1__media__shortcode__web_info,items`
+    #
+    # @returns Hash a ruby hash of the JSON data
+    def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
+      # So this is fun:
+      # For pages marked as misinformation we have to use one method (interception of requests) and
+      # for pages that are not, we can just pull the data straight from the page.
+      #
+      # How do we figure out which is which?... for now we'll just run through both and see where we
+      # go with it.
+
+      # Our user data no longer lives in the graphql object passed initially with the page.
+      # Instead it comes in as part of a subsequent call. This intercepts all calls, checks if it's
+      # the one we want, and then moves on.
+      response_body = nil
+
+      page.driver.browser.intercept do |request, &continue|
+        # This passes the request forward unmodified, since we only care about the response
+        # puts "checking request: #{request.url}"
+
+        continue.call(request) && next unless request.url.include?(subpage_search)
+
+
+        continue.call(request) do |response|
+
+          # Check if not a CORS prefetch and finish up if not
+          if !response.body.empty? && response.body
+            check_passed = true
+            unless additional_search_parameters.nil?
+              body_to_check = Oj.load(response.body)
+
+              search_parameters = additional_search_parameters.split(",")
+              search_parameters.each_with_index do |key, index|
+                break if body_to_check.nil?
+
+                check_passed = false unless body_to_check.has_key?(key)
+                body_to_check = body_to_check[key]
+              end
+            end
+
+            response_body = response.body if check_passed == true
+          end
+        end
+      rescue Selenium::WebDriver::Error::WebDriverError
+        # Eat them
+      end
+
+      # Now that the intercept is set up, we visit the page we want
+      page.driver.browser.navigate.to(url)
+      # We wait until the correct intercept is processed or we've waited 60 seconds
+      start_time = Time.now
+      # puts "Waiting.... #{url}"
+
+      sleep(rand(1...10))
+      while response_body.nil? && (Time.now - start_time) < 60
+        sleep(0.1)
+      end
+
+      page.driver.execute_script("window.stop();")
+      raise Birdsong::NoTweetFoundError if response_body.nil?
+      Oj.load(response_body)
+    end
+
+    private
+
+    ##########
+    # Set the session to use a new user folder in the options!
+    # #####################
+    def reset_selenium
+      options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
+      options.add_argument("--start-maximized")
+      options.add_argument("--no-sandbox")
+      options.add_argument("--disable-dev-shm-usage")
+      options.add_argument("--disable-blink-features=AutomationControlled")
+      options.add_argument("--disable-extensions")
+      options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
+
+      options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
+      options.add_preference "password_manager_enabled", false
+      options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
+      # options.add_argument("--user-data-dir=/tmp/tarun")
+
+      Capybara.register_driver :selenium do |app|
+        client = Selenium::WebDriver::Remote::Http::Curb.new
+        # client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
+        Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
+      end
+
+      Capybara.current_driver = :selenium
+    end
+
+    def login
+      # Reset the sessions so that there's nothing lying around
+      page.quit
+
+      # Check if we're on an Instagram page already, if not visit it.
+      unless page.driver.browser.current_url.include? "instagram.com"
+        # There seems to be a bug in the Linux ARM64 version of chromedriver where this will properly
+        # navigate but then timeout, crashing it all up. So instead we check and raise the error when
+        # that then fails again.
+        page.driver.browser.navigate.to("https://instagram.com")
+      end
+
+      # We don't have to log in if we already are
+      begin
+        return if find_field("Search", wait: 10).present?
+      rescue Capybara::ElementNotFound; end
+
+      # Check if we're redirected to a login page, if we aren't we're already logged in
+      return unless page.has_xpath?('//*[@id="loginForm"]/div/div[3]/button')
+
+      # Try to log in
+      loop_count = 0
+      while loop_count < 5 do
+        fill_in("username", with: ENV["INSTAGRAM_USER_NAME"])
+        fill_in("password", with: ENV["INSTAGRAM_PASSWORD"])
+
+        begin
+          click_button("Log in", exact_text: true) # Note: "Log in" (lowercase `in`) instead redirects to Facebook's login page
+        rescue Capybara::ElementNotFound; end # If we can't find it don't break horribly, just keep waiting
+
+        break unless has_css?('p[data-testid="login-error-message"]', wait: 10)
+        loop_count += 1
+        sleep(rand * 10.3)
+      end
+
+      # Sometimes Instagram just... doesn't let you log in
+      raise "Instagram not accessible" if loop_count == 5
+
+      # No, we don't want to save our login credentials
+      begin
+        click_on("Save Info")
+      rescue Capybara::ElementNotFound; end
+    end
+
+    def fetch_image(url)
+      request = Typhoeus::Request.new(url, followlocation: true)
+      request.on_complete do |response|
+        if request.success?
+          return request.body
+        elsif request.timed_out?
+          raise Zorki::Error("Fetching image at #{url} timed out")
+        else
+          raise Zorki::Error("Fetching image at #{url} returned non-successful HTTP server response #{request.code}")
+        end
+      end
+    end
+
+    # Convert a string to an integer
+    def number_string_to_integer(number_string)
+      # First we have to remove any commas in the number or else it all breaks
+      number_string = number_string.delete(",")
+      # Is the last digit not a number? If so, we're going to have to multiply it by some multiplier
+      should_expand = /[0-9]/.match(number_string[-1, 1]).nil?
+
+      # Get the last index and remove the letter at the end if we should expand
+      last_index = should_expand ? number_string.length - 1 : number_string.length
+      number = number_string[0, last_index].to_f
+      multiplier = 1
+      # Determine the multiplier depending on the letter indicated
+      case number_string[-1, 1]
+      when "m"
+        multiplier = 1_000_000
+      end
+
+      # Multiply everything and ensure we get an integer back
+      (number * multiplier).to_i
+    end
+  end
+end
+
+# require_relative "tweet_scraper"
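get_content_of_subpage_from_url does the heavy lifting for every scraper: it registers a Chrome DevTools intercept, navigates to the page, and only accepts a response whose URL contains subpage_search and whose JSON body contains each comma-separated key from additional_search_parameters, walked in order. A hypothetical call is sketched below, mirroring the arguments the TweetScraper in the next file passes; the tweet id is illustrative and the snippet does not ship with the gem.

# Hypothetical usage sketch of Birdsong::Scraper#get_content_of_subpage_from_url.
require "birdsong"

scraper = Birdsong::TweetScraper.new   # any Scraper subclass; TweetScraper is defined in the next file
graphql = scraper.get_content_of_subpage_from_url(
  "https://twitter.com/jack/status/20",  # page to visit (the id here is illustrative)
  "/graphql",                            # substring the intercepted request URL must contain
  "data,tweetResult,result"              # nested keys the JSON body must contain, in order
)
# graphql is the parsed Hash of the first intercepted /graphql response for which
# graphql["data"]["tweetResult"]["result"] is present; Birdsong::NoTweetFoundError
# is raised if no matching response arrives within roughly 60 seconds.

For reference, the private number_string_to_integer helper at the bottom of the class strips commas and applies an "m" multiplier, so "1,234" becomes 1234 and "1.2m" becomes 1_200_000.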
data/lib/birdsong/scrapers/tweet_scraper.rb
ADDED
@@ -0,0 +1,112 @@
+# frozen_string_literal: true
+
+require "typhoeus"
+require_relative "scraper"
+
+module Birdsong
+  class TweetScraper < Scraper
+    def parse(id)
+      # Stuff we need to get from the DOM (implemented is starred):
+      # - User *
+      # - Text *
+      # - Image * / Images * / Video *
+      # - Date *
+      # - Number of likes *
+      # - Hashtags
+
+      Capybara.app_host = "https://twitter.com"
+
+      # video slideshows https://www.instagram.com/p/CY7KxwYOFBS/?utm_source=ig_embed&utm_campaign=loading
+      # login
+      graphql_object = get_content_of_subpage_from_url(
+        "https://twitter.com/jack/status/#{id}",
+        "/graphql",
+        "data,tweetResult,result"
+      )
+
+      graphql_object = graphql_object.first if graphql_object.kind_of?(Array)
+      graphql_object = graphql_object["data"]["tweetResult"]["result"]
+
+      if graphql_object.key?("__typename") && graphql_object["__typename"] == "TweetUnavailable"
+        raise Birdsong::NoTweetFoundError
+      end
+
+      text = graphql_object["legacy"]["full_text"]
+      date = graphql_object["legacy"]["created_at"]
+      id = graphql_object["legacy"]["id_str"]
+      number_of_likes = graphql_object["legacy"]["favorite_count"]
+      language = graphql_object["legacy"]["lang"]
+
+      images = []
+      videos = []
+      video_preview_image = nil
+      video_file_type = nil
+
+      if graphql_object["legacy"]["entities"].key?("media")
+        graphql_object["legacy"]["entities"]["media"].each do |media|
+          case media["type"]
+          when "photo"
+            images << Birdsong.retrieve_media(media["media_url_https"])
+          when "video"
+            video_preview_image = Birdsong.retrieve_media(media["media_url_https"])
+            video_variants = media["video_info"]["variants"]
+            largest_bitrate_variant = video_variants.sort_by do |variant|
+              variant["bitrate"].nil? ? 0 : variant["bitrate"]
+            end.last
+
+            videos << Birdsong.retrieve_media(largest_bitrate_variant["url"])
+            video_file_type = "video"
+          when "animated_gif"
+            video_preview_image = Birdsong.retrieve_media(media["media_url_https"])
+            videos << media["video_info"]["variants"].first["url"]
+            video_file_type = "animated_gif"
+          end
+        end
+      end
+
+      screenshot_file = take_screenshot()
+
+      # This has to run last since it switches pages
+      user_object = graphql_object["core"]["user_results"]["result"]
+      user = {
+        id: user_object["id"],
+        name: user_object["legacy"]["name"],
+        username: user_object["legacy"]["screen_name"],
+        sign_up_date: user_object["legacy"]["created_at"],
+        location: user_object["legacy"]["location"],
+        profile_image_url: user_object["legacy"]["profile_image_url_https"],
+        description: user_object["legacy"]["description"],
+        followers_count: user_object["legacy"]["followers_count"],
+        following_count: user_object["legacy"]["friends_count"],
+        tweet_count: user_object["legacy"]["statuses_count"],
+        listed_count: user_object["legacy"]["listed_count"],
+        verified: user_object["legacy"]["verified"],
+        url: user_object["legacy"]["url"],
+        profile_image_file_name: Birdsong.retrieve_media(user_object["legacy"]["profile_image_url_https"])
+      }
+
+      page.quit
+
+      {
+        images: images,
+        video: videos,
+        video_preview_image: video_preview_image,
+        screenshot_file: screenshot_file,
+        text: text,
+        date: date,
+        number_of_likes: number_of_likes,
+        user: user,
+        id: id,
+        language: language,
+        video_file_type: video_file_type
+      }
+    end
+
+    def take_screenshot
+      # First check if a post has a fact check overlay, if so, clear it.
+      # The only issue is that this can take *awhile* to search. Not sure what to do about that
+      # since it's Instagram's fault for having such a fucked up obfuscated hierarchy # Take the screenshot and return it
+      save_screenshot("#{Birdsong.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
+    end
+  end
+end
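The hash that parse returns is the scraper's whole public payload. A hypothetical invocation is sketched below; the id is illustrative and the snippet is not part of the gem (Birdsong.retrieve_media and Birdsong.temp_storage_location, used above, are defined elsewhere in the gem).

# Hypothetical invocation sketch for Birdsong::TweetScraper#parse.
tweet = Birdsong::TweetScraper.new.parse("20")  # id is interpolated into twitter.com/jack/status/<id>

tweet[:text]             # full_text from the tweet's "legacy" payload
tweet[:date]             # created_at string from the same payload
tweet[:number_of_likes]  # favorite_count
tweet[:images]           # files saved via Birdsong.retrieve_media for photo entities
tweet[:video]            # highest-bitrate variant for each video entity
tweet[:video_file_type]  # "video", "animated_gif", or nil
tweet[:screenshot_file]  # PNG written by take_screenshot
tweet[:user][:username]  # screen_name from core.user_results.result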
data/lib/birdsong/version.rb
CHANGED
data/lib/birdsong.rb
CHANGED
@@ -10,7 +10,7 @@ require "fileutils"
 require_relative "birdsong/version"
 require_relative "birdsong/tweet"
 require_relative "birdsong/user"
-
+require_relative "birdsong/scrapers/scraper"
 require_relative "birdsong/scrapers/tweet_scraper"
 
 require_relative "birdsong/monkeypatch"
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: birdsong
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.2.3
 platform: ruby
 authors:
 - Christopher Guess
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-10-
+date: 2023-10-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: typhoeus
@@ -172,6 +172,9 @@ files:
 - bin/setup
 - birdsong.gemspec
 - lib/birdsong.rb
+- lib/birdsong/monkeypatch.rb
+- lib/birdsong/scrapers/scraper.rb
+- lib/birdsong/scrapers/tweet_scraper.rb
 - lib/birdsong/tweet.rb
 - lib/birdsong/user.rb
 - lib/birdsong/version.rb