forki 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,360 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "typhoeus"
4
+ require "securerandom"
5
+ require "byebug"
6
+
7
+ module Forki
8
+ # rubocop:disable Metrics/ClassLength
9
+ class PostScraper < Scraper
10
# Searches the DOM to find the number of times a (video) post has been viewed.
# Returns nil if no DOM element containing a view count can be located.
def find_number_of_views
  views_pattern = /[0-9MK, ]+Views/
  # Scan every <span> for text that looks like a view counter
  views_span = all("span").find { |span| span.text(:all) =~ views_pattern }
  extract_int_from_num_element(views_span)
end
19
+
20
# Dispatches to the appropriate extractor based on what kind of post the
# GraphQL payload describes (video, video-in-comment-stream, or image).
# Raises ContentUnavailableError for deleted/blocked posts and
# UnhandledContentError when no extractor matches.
def extract_post_data(graphql_strings)
  # Bail out of the post otherwise scraping gets stuck
  raise ContentUnavailableError unless is_post_available?

  parsed_objects = get_graphql_objects(graphql_strings)
  has_video = check_if_post_is_video(parsed_objects)
  has_image = check_if_post_is_image(parsed_objects)

  # There's a chance it may be embedded in a comment chain like this:
  # https://www.facebook.com/PlandemicMovie/posts/588866298398729/
  has_video_in_comment_stream = has_video == false && check_if_post_is_in_comment_stream(parsed_objects)

  return extract_video_post_data(graphql_strings) if has_video
  return extract_video_comment_post_data(parsed_objects) if has_video_in_comment_stream
  return extract_image_post_data(parsed_objects) if has_image

  raise UnhandledContentError
end
42
+
43
# Parses every raw GraphQL JSON string into a Ruby object.
def get_graphql_objects(graphql_strings)
  graphql_strings.map { |raw_json| JSON.parse(raw_json) }
end
46
+
47
# Returns true when any GraphQL object carries a video marker key
# ("is_live_streaming" or "video") or describes a Reel.
def check_if_post_is_video(graphql_objects)
  graphql_objects.any? do |obj|
    obj.key?("is_live_streaming") || obj.key?("video") || check_if_post_is_reel(obj)
  end
end
50
+
51
# Returns true when the GraphQL object describes a Facebook Reel
# (i.e. its first attachment's style_infos mentions "fb_shorts_story").
def check_if_post_is_reel(graphql_object)
  return false unless graphql_object.key?("node")

  style_infos =
    begin
      graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"]
        .first["styles"]["attachment"]["style_infos"].first
    rescue NoMethodError
      # The object doesn't match the attribute chain above — some link was nil
      return false
    end

  style_infos.include?("fb_shorts_story")
end
62
+
63
# Returns true when any GraphQL object carries a non-nil "image" or "currMedia"
# value, i.e. the post is an image post.
#
# FIX: the original block evaluated the "image" check but discarded its value
# (only the last expression of a block is its result), so objects keyed by
# "image" alone were never detected. Both keys are now honored.
def check_if_post_is_image(graphql_objects)
  graphql_objects.any? do |graphql_object|
    !graphql_object.fetch("image", nil).nil? || !graphql_object.fetch("currMedia", nil).nil?
  end
end
69
+
70
# Returns true when the post's video is embedded in a comment stream:
# the first node's first attachment must have media of __typename "Video".
def check_if_post_is_in_comment_stream(graphql_objects)
  graphql_objects.each do |obj|
    next unless obj.key?("nodes")

    media_type =
      begin
        obj["nodes"].first["comet_sections"]["content"]["story"]["attachments"]
          .first["styles"]["attachment"]["media"]["__typename"]
      rescue StandardError
        # The structure is so specific that checking every link is not worth it;
        # any failure along the chain just means "not this object"
        next
      end

    return true if media_type == "Video"
  end

  false
end
86
+
87
# Returns true when the post can still be viewed, false when Facebook shows an
# unavailability banner. Probes the DOM for two known banner texts; each
# Capybara `find` waits up to 5 seconds, so a *healthy* post pays up to ~10s
# here before we conclude it is available.
def is_post_available?
  begin
    # Banner variant 1: "...content isn't available..."
    find("span", wait: 5, text: "content isn't available", exact_text: false)
  rescue Capybara::ElementNotFound, Selenium::WebDriver::Error::StaleElementReferenceError
    begin
      # Banner variant 2: "This Video Isn't Available Anymore"
      find("span", wait: 5, text: "This Video Isn't Available Anymore", exact_text: false)
    rescue Capybara::ElementNotFound, Selenium::WebDriver::Error::StaleElementReferenceError
      # Neither banner found — the post appears to be up
      return true
    end
  end

  # One of the banners was found, so the post is gone
  false
end
101
+
102
# Extracts data for a video post that is embedded in a comment stream
# (e.g. https://www.facebook.com/PlandemicMovie/posts/588866298398729/).
# Takes parsed GraphQL hashes and returns a hash of post attributes, including
# downloaded media files via Forki.retrieve_media.
def extract_video_comment_post_data(graphql_objects)
  graphql_nodes = nil
  # Grab the "nodes" list from the first GraphQL object that has one
  graphql_objects.find do |graphql_object|
    next unless graphql_object.key?("nodes")
    graphql_nodes = graphql_object["nodes"]

    break
  end

  # Navigate Facebook's deeply nested GraphQL schema to the media and feedback objects
  media = graphql_nodes.first["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]
  inital_feedback_object = graphql_nodes.first["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]
  feedback_object = inital_feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]

  post_details = {
    id: media["id"],
    num_comments: feedback_object["comment_count"]["total_count"],
    num_shares: feedback_object["share_count"]["count"],
    num_views: feedback_object["video_view_count"],
    reshare_warning: feedback_object["should_show_reshare_warning"],
    video_preview_image_url: media["preferred_thumbnail"]["image"]["uri"],
    # Prefer the HD rendition when Facebook provides one
    video_url: media["playable_url_quality_hd"] || media["playable_url"],
    text: graphql_nodes.first["comet_sections"]["content"]["story"]["comet_sections"]["message"]["story"]["message"]["text"],
    created_at: media["publish_time"],
    profile_link: graphql_nodes.first["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"].first["url"],
    has_video: true
  }

  # Download the preview image and the video itself
  post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  # NOTE(review): reactions here use "i18n_reaction_count" (a localized count),
  # unlike the other extractors which build a per-emoji hash — confirm intended
  post_details[:reactions] = inital_feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["i18n_reaction_count"]
  post_details
end
134
+
135
# Unfortunately, there's a taxonomy of video post types, all of which require different parsing methods.
# Specifically, there are normal video posts, video posts from the watch page, and live video posts from the watch page.
# The general strategy for extracting information from each type, though, is to find which of the 30-odd
# GraphQL strings are relevant, parse them into hashes, and extract the information we need.
def extract_video_post_data(graphql_strings)
  # "Watch page" videos use a completely different layout
  return extract_video_post_data_from_watch_page(graphql_strings) if all("h1").any? { |h1| h1.text.strip == "Watch" }

  graphql_object_array = graphql_strings.map { |graphql_string| JSON.parse(graphql_string) }
  story_node_object = graphql_object_array.find { |graphql_object| graphql_object.key? "node" }&.fetch("node", nil) # user posted video
  story_node_object ||= graphql_object_array.find { |graphql_object| graphql_object.key? "nodes" }&.fetch("nodes")&.first # page posted video

  return extract_video_post_data_alternative(graphql_object_array) if story_node_object.nil?

  attachment = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]
  if attachment.key?("media")
    video_object = attachment["media"]
    creation_date = video_object["publish_time"]
  elsif attachment.key?("style_infos")
    # "Reels" carry their video under style_infos and need a separate path
    shorts_story = attachment["style_infos"].first["fb_shorts_story"]
    video_object = shorts_story["short_form_video_context"]["playback_video"]
    creation_date = shorts_story["creation_time"]
  else
    raise "Unable to parse video object"
  end

  feedback_object = story_node_object["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]
  reaction_counts = extract_reaction_counts(feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])
  share_count_object = feedback_object.fetch("share_count", {})

  if story_node_object["comet_sections"]["content"]["story"]["comet_sections"].key? "message"
    text = story_node_object["comet_sections"]["content"]["story"]["comet_sections"]["message"]["story"]["message"]["text"]
  else
    text = ""
  end

  # FIX: removed a stray, unguarded lookup of feedback_object["comment_list_renderer"][...]
  # whose result was discarded — it raised NoMethodError whenever that key was absent,
  # defeating the has_key? guard below.
  num_comments = feedback_object.has_key?("comment_list_renderer") ? feedback_object["comment_list_renderer"]["feedback"]["comment_count"]["total_count"] : feedback_object["comment_count"]["total_count"]

  post_details = {
    id: video_object["id"],
    num_comments: num_comments,
    num_shares: share_count_object.fetch("count", nil),
    num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
    reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
    video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
    video_url: video_object["playable_url_quality_hd"] || video_object["playable_url"],
    text: text,
    created_at: creation_date,
    profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
    has_video: true
  }
  post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  post_details[:reactions] = reaction_counts
  post_details
end
193
+
194
# Fallback video extractor used when no "node"/"nodes" story object exists:
# pulls everything from the "tahoe_sidepane_renderer" object instead.
def extract_video_post_data_alternative(graphql_object_array)
  sidepane_object = graphql_object_array.find { |graphql_object| graphql_object.key?("tahoe_sidepane_renderer") }
  # The video object is the GraphQL hash whose ONLY key is "video"
  video_object = graphql_object_array.find { |graphql_object| graphql_object.keys == ["video"] }
  feedback_object = sidepane_object["tahoe_sidepane_renderer"]["video"]["feedback"]
  reaction_counts = extract_reaction_counts(sidepane_object["tahoe_sidepane_renderer"]["video"]["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])
  share_count_object = feedback_object.fetch("share_count", {})

  post_details = {
    # NOTE(review): video_object's only key is "video", so video_object["id"]
    # looks like it is always nil — should this be video_object["video"]["id"]? Confirm.
    id: video_object["id"],
    num_comments: feedback_object["comments_count_summary_renderer"]["feedback"]["total_comment_count"],
    num_shares: share_count_object.fetch("count", nil),
    num_views: feedback_object["video_view_count"],
    reshare_warning: feedback_object["should_show_reshare_warning"],
    video_preview_image_url: video_object["video"]["preferred_thumbnail"]["image"]["uri"],
    video_url: video_object["video"]["playable_url_quality_hd"] || video_object["video"]["playable_url"],
    text: sidepane_object["tahoe_sidepane_renderer"]["video"]["creation_story"]["comet_sections"]["message"]["story"]["message"]["text"],
    created_at: video_object["video"]["publish_time"],
    profile_link: sidepane_object["tahoe_sidepane_renderer"]["video"]["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
    has_video: true
  }

  # Download media referenced by the post
  post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  post_details[:reactions] = reaction_counts
  post_details
end
220
+
221
# Extracts data from an image post by parsing GraphQL strings as seen in the video post scraper above.
# Returns a hash of post attributes, including the downloaded image file.
def extract_image_post_data(graphql_object_array)
  # FIX: removed a stray `graphql_object_array.find { ... viewer_actor ... }`
  # whose result was discarded — it had no effect.
  curr_media_object = graphql_object_array.find { |graphql_object| graphql_object.key?("currMedia") }
  creation_story_object = graphql_object_array.find { |graphql_object| graphql_object.key?("creation_story") && graphql_object.key?("message") }

  feedback_object = graphql_object_array.find { |graphql_object| graphql_object.has_key?("comet_ufi_summary_and_actions_renderer") }["comet_ufi_summary_and_actions_renderer"]["feedback"]
  share_count_object = feedback_object.fetch("share_count", {})

  poster = creation_story_object["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]

  reaction_counts = extract_reaction_counts(feedback_object["cannot_see_top_custom_reactions"]["top_reactions"])
  post_details = {
    id: curr_media_object["currMedia"]["id"],
    num_comments: feedback_object["comments_count_summary_renderer"]["feedback"]["total_comment_count"],
    num_shares: share_count_object.fetch("count", nil),
    reshare_warning: feedback_object["should_show_reshare_warning"],
    image_url: curr_media_object["currMedia"]["image"]["uri"],
    text: (creation_story_object["message"] || {}).fetch("text", nil),
    profile_link: poster["url"],
    created_at: curr_media_object["currMedia"]["created_time"],
    has_video: false
  }
  post_details[:image_file] = Forki.retrieve_media(post_details[:image_url])
  post_details[:reactions] = reaction_counts
  post_details
end
248
+
249
# Extract data from a non-live video post on the watch page.
# Live videos are delegated to extract_live_video_post_data_from_watch_page.
def extract_video_post_data_from_watch_page(graphql_strings)
  return extract_live_video_post_data_from_watch_page(graphql_strings) if current_url.include?("live")
  # The "video" object and the "creation_story" object live in different GraphQL strings
  video_object = graphql_strings.map { |g| JSON.parse(g) }.find { |x| x.key?("video") }
  creation_story_object = JSON.parse(graphql_strings.find { |graphql_string| (graphql_string.include?("creation_story")) && \
    (graphql_string.include?("live_status")) })
  # Facebook escapes URLs with backslashes inside the embedded JSON; strip them
  video_permalink = creation_story_object["creation_story"]["shareable"]["url"].delete("\\")
  media_object = video_object["video"]["story"]["attachments"][0]["media"]
  reaction_counts = extract_reaction_counts(creation_story_object["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])

  post_details = {
    id: video_object["id"],
    num_comments: creation_story_object["feedback"]["total_comment_count"],
    num_shares: nil, # Not present for watch feed videos?
    num_views: creation_story_object["feedback"]["video_view_count_renderer"]["feedback"]["video_view_count"],
    reshare_warning: creation_story_object["feedback"]["should_show_reshare_warning"],
    video_preview_image_url: video_object["video"]["story"]["attachments"][0]["media"]["preferred_thumbnail"]["image"]["uri"],
    video_url: (media_object.fetch("playable_url_quality_hd", nil) || media_object.fetch("playable_url", nil)).delete("\\"),
    text: (creation_story_object["creation_story"]["message"] || {})["text"],
    created_at: video_object["video"]["story"]["attachments"][0]["media"]["publish_time"],
    # Keep everything up to and including "/videos" as the profile link
    profile_link: video_permalink[..video_permalink.index("/videos")],
    has_video: true
  }

  post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  post_details[:reactions] = reaction_counts
  post_details
end
278
+
279
# Extract data from a live video post on the watch page.
def extract_live_video_post_data_from_watch_page(graphql_strings)
  # The creation story and the playable media come from two different GraphQL strings
  creation_story_object = JSON.parse(graphql_strings.find { |graphql| (graphql.include? "comment_count") && \
    (graphql.include? "creation_story") })["video"]["creation_story"]
  media_object = JSON.parse(graphql_strings.find { |graphql| graphql.include? "playable_url" })["video"]["creation_story"]["attachments"][0]["media"]
  # Strip the backslash-escaping Facebook applies to URLs in embedded JSON
  video_permalink = creation_story_object["shareable"]["url"].delete("\\")
  reaction_counts = extract_reaction_counts(creation_story_object["feedback_context"]["feedback_target_with_context"]["cannot_see_top_custom_reactions"]["top_reactions"])

  post_details = {
    id: creation_story_object["shareable"]["id"],
    num_comments: creation_story_object["feedback_context"]["feedback_target_with_context"]["total_comment_count"],
    num_shares: nil,
    num_views: find_number_of_views, # as far as I can tell, this is never present for live videos
    reshare_warning: creation_story_object["feedback_context"]["feedback_target_with_context"]["should_show_reshare_warning"],
    video_preview_image_url: creation_story_object["attachments"][0]["media"]["preferred_thumbnail"]["image"]["uri"],
    video_url: (media_object.fetch("playable_url_quality_hd", nil) || media_object.fetch("playable_url", nil)).delete("\\"),
    text: creation_story_object["attachments"][0]["media"]["savable_description"]["text"],
    created_at: creation_story_object["attachments"][0]["media"]["publish_time"],
    # Keep everything up to and including "/videos" as the profile link
    profile_link: video_permalink[..video_permalink.index("/videos")],
    has_video: true
  }

  post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  post_details[:reactions] = reaction_counts
  post_details
end
306
+
307
# Returns a hash containing counts of each reaction to a post.
# Each edge becomes a single-entry hash like {num_likes: 1234}; the hashes are
# then merged together. Returns nil when the edges list is empty (inject on []).
def extract_reaction_counts(reactions_object)
  singletons = reactions_object["edges"].map do |edge|
    key = "num_#{edge["node"]["localized_name"].downcase}s".to_sym
    { key => edge["reaction_count"] }
  end
  singletons.inject { |emoji_counts, single| emoji_counts.merge(single) }
end
317
+
318
# Captures a screenshot of the current post and returns the saved file path.
def take_screenshot
  # First check whether the post has a fact-check overlay; if it does, clear it
  begin
    find('div[aria-label=" See Photo "]').click() || find('div[aria-label=" See Video "]').click()
  rescue Capybara::ElementNotFound
    # No overlay to dismiss — nothing to do
  end

  screenshot_path = "#{Forki.temp_storage_location}/facebook_screenshot_#{SecureRandom.uuid}.png"
  save_screenshot(screenshot_path)
end
328
+
329
# Uses GraphQL data and DOM elements to collect information about the current post.
# Returns the post-details hash, or nil if a Net::ReadTimeout escapes the retry loop.
def parse(url)
  validate_and_load_page(url)
  graphql_strings = find_graphql_data_strings(page.html)
  post_data = extract_post_data(graphql_strings)
  post_data[:url] = url
  user_url = post_data[:profile_link]

  # Screenshots occasionally time out; retry up to 5 times with a pause between attempts
  5.times do
    begin
      post_data[:screenshot_file] = take_screenshot
      break
    rescue Net::ReadTimeout; end

    sleep(5)
  end

  # page.quit # Close browser between page navigations to prevent cache folder access issues

  post_data[:user] = User.lookup(user_url).first
  page.quit

  post_data
rescue Net::ReadTimeout
  # Eat it?
  # NOTE(review): swallowing this makes parse return nil — confirm callers handle a nil result
rescue StandardError => e
  # NOTE(review): rescue-then-raise is a no-op passthrough; presumably kept as a debugging hook
  raise e
ensure
  # NOTE(review): on the happy path this is a second quit after the one above — confirm it is idempotent
  page.quit
end
359
+ end
360
+ end
@@ -0,0 +1,189 @@
1
+ # frozen_string_literal: true
2
+
3
+ # require_relative "user_scraper"
4
+ require "capybara/dsl"
5
+ require "dotenv/load"
6
+ require "oj"
7
+ require "selenium-webdriver"
8
+ require "open-uri"
9
+
10
# Chrome options tuned to reduce automation fingerprinting while scraping.
options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
options.add_argument("--start-maximized")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
# FIX: this flag previously started with a Unicode en dash ("–-"), so Chrome
# never received --disable-blink-features=AutomationControlled
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--disable-extensions")
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
options.add_preference "password_manager_enabled", false
# FIX: removed a duplicated --disable-dev-shm-usage (already added above)
options.add_argument("--remote-debugging-port=9222")
# Fresh profile directory per run so sessions don't collide
options.add_argument("--user-data-dir=/tmp/tarun_forki_#{SecureRandom.uuid}")

Capybara.register_driver :selenium_forki do |app|
  client = Selenium::WebDriver::Remote::Http::Default.new
  client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
  Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
end

Capybara.default_max_wait_time = 60
Capybara.threadsafe = true
Capybara.reuse_server = true
31
+
32
+ module Forki
33
+ class Scraper # rubocop:disable Metrics/ClassLength
34
+ include Capybara::DSL
35
+
36
# Configures Capybara to use the registered :selenium_forki driver and applies
# the gem's logger level. Order matters: driver selection is global state.
def initialize
  Capybara.default_driver = :selenium_forki
  Forki.set_logger_level
  # reset_selenium
end
41
+
42
# Downloads the image referenced by an <img> element's src attribute and writes
# it to the scratch directory.
# NOTE(review): the original comment said to use the tmp/ directory created
# during setup, but the code writes "temp/emoji.png" — confirm which exists.
def download_image(img_elem)
  # FIX: use the block form so the HTTP stream is closed; the original leaked
  # the IO handle returned by URI.open
  img_data = URI.open(img_elem["src"], &:read)
  File.binwrite("temp/emoji.png", img_data)
end
47
+
48
# Returns all GraphQL data objects embedded within a string.
# Finds substrings that look like '"data":{...}' and collects the JSON text of
# each (prefix stripped), recursing on the remainder of the page source.
# NOTE: the optional accumulator comes *before* the required html_str — legal
# Ruby, but call sites pass only the string.
def find_graphql_data_strings(objs = [], html_str)
  data_marker = '"data":{'
  data_start_index = html_str.index(data_marker)
  return objs if data_start_index.nil? # No more data blocks in the page source

  # Index one past the matching closing brace of this data object
  data_closure_index = find_graphql_data_closure_index(html_str, data_start_index)
  return objs if data_closure_index.nil?

  # Keep only the {...} part, dropping the '"data":' prefix
  graphql_data_str = html_str[data_start_index...data_closure_index].delete_prefix('"data":')
  # Recurse on the rest of the document for further data blocks
  objs + [graphql_data_str] + find_graphql_data_strings(html_str[data_closure_index..])
end
61
+
62
# Scans forward from the '"data":{' marker at start_index and returns the index
# one past the brace that closes the data object, tracking nested braces.
# Raises when the object is truncated/malformed.
def find_graphql_data_closure_index(html_str, start_index)
  closure_index = start_index + 8 # length of data marker. Begin search right after open brace
  raise "Malformed graphql data object: no closing bracket found" if closure_index > html_str.length

  brace_stack = 1 # the marker's own open brace
  loop do
    # FIX: guard against running off the end of the string — the original looped
    # forever when the data object was truncated (nil never matched "{" or "}")
    raise "Malformed graphql data object: no closing bracket found" if closure_index >= html_str.length

    if html_str[closure_index] == "{"
      brace_stack += 1
    elsif html_str[closure_index] == "}"
      brace_stack -= 1
    end

    closure_index += 1
    break if brace_stack.zero?
  end

  closure_index
end
80
+
81
+ private
82
+
83
##########
# Re-registers the :selenium_forki driver with a brand-new user data folder in
# the options, then switches the current driver to it.
# #####################
def reset_selenium
  options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
  options.add_argument("--start-maximized")
  options.add_argument("--no-sandbox")
  options.add_argument("--disable-dev-shm-usage")
  # FIX: this flag previously started with a Unicode en dash ("–-"), so Chrome
  # never received --disable-blink-features=AutomationControlled
  options.add_argument("--disable-blink-features=AutomationControlled")
  options.add_argument("--disable-extensions")
  options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
  options.add_preference "password_manager_enabled", false
  # FIX: removed a duplicated --disable-dev-shm-usage (already added above)
  options.add_argument("--remote-debugging-port=9222")
  # Fresh profile directory so the new session doesn't collide with the old one
  options.add_argument("--user-data-dir=/tmp/tarun_forki_#{SecureRandom.uuid}")

  Capybara.register_driver :selenium_forki do |app|
    client = Selenium::WebDriver::Remote::Http::Default.new
    client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
    Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
  end

  Capybara.current_driver = :selenium_forki
end
107
+
108
# Logs in to Facebook (if not already logged in) using credentials from the
# FACEBOOK_EMAIL / FACEBOOK_PASSWORD environment variables.
# Raises MissingCredentialsError when they are unset, and BlockedCredentialsError
# when Facebook shows its error box after submitting.
def login(url = nil)
  raise MissingCredentialsError if ENV["FACEBOOK_EMAIL"].nil? || ENV["FACEBOOK_PASSWORD"].nil?

  url ||= "https://www.facebook.com"
  visit(url) # Visit the url passed in or the facebook homepage if nothing is

  # Look for "login_form" box, which throws an error if not found. So we catch it and run the rest of the tests
  begin
    login_form = first(id: "login_form", wait: 5)
  rescue Capybara::ElementNotFound
    # No login form and the title isn't the login page — we're already logged in
    return unless page.title.downcase.include?("facebook - log in")
  end

  # Since we're not logged in, let's do that quick
  # NOTE(review): when the rescue path above falls through, login_form is still
  # nil after this visit, so the fill_in calls below would raise — confirm this
  # branch is reachable in practice
  visit("https://www.facebook.com") if login_form.nil?

  login_form.fill_in("email", with: ENV["FACEBOOK_EMAIL"])
  login_form.fill_in("pass", with: ENV["FACEBOOK_PASSWORD"])

  # This is a pain because some pages just `click_button` would work, but some won't
  login_buttons = login_form.all("div", text: "Log In", wait: 5)

  if login_buttons.empty?
    login_form.click_button("Log In")
  else
    # Click the first div whose visible text is exactly "Log In"
    login_buttons.each do |button|
      if button.text == "Log In"
        button.click
        break
      end
    end
  end

  # Facebook shows an #error_box element when the account is blocked/challenged
  begin
    raise Forki::BlockedCredentialsError if find_by_id("error_box", wait: 3)
  rescue Capybara::ElementNotFound; end

  # Now we wait awhile, hopefully to slow down scraping
  sleep(rand * 10.3)
end
149
+
150
# Ensures that a valid Facebook url has been provided, and that it points to an available post.
# If either of those two conditions are false, raises an exception.
# NOTE(review): login(url) visits the url *before* the InvalidUrlError check
# below runs, so a non-Facebook url is loaded once before being rejected — confirm intended.
def validate_and_load_page(url)
  Capybara.app_host = "https://www.facebook.com"
  facebook_url = "https://www.facebook.com"
  # visit "https://www.facebook.com" unless current_url.start_with?(facebook_url)
  login(url)
  raise Forki::InvalidUrlError unless url.start_with?(facebook_url)
  # Avoid a redundant navigation if login already landed us on the target url
  visit url unless current_url.start_with?(url)
end
160
+
161
+ # Extracts an integer out of a string describing a number
162
+ # e.g. "4K Comments" returns 4000
163
+ # e.g. "131 Shares" returns 131
164
+ def extract_int_from_num_element(element)
165
+ return unless element
166
+
167
+ if element.class != String # if an html element was passed in
168
+ element = element.text(:all)
169
+ end
170
+
171
+ num_pattern = /[0-9KM ,.]+/
172
+ interaction_num_text = num_pattern.match(element)[0]
173
+
174
+ if interaction_num_text.include?(".") # e.g. "2.2K"
175
+ interaction_num_text.to_i + interaction_num_text[-2].to_i * 100
176
+ elsif interaction_num_text.include?("K") # e.g. "13K"
177
+ interaction_num_text.to_i * 1000
178
+ elsif interaction_num_text.include?("M") # e.g. "13M"
179
+ interaction_num_text.to_i * 1_000_000
180
+ else # e.g. "15,443"
181
+ interaction_num_text.delete!(",")
182
+ interaction_num_text.delete(" ").to_i
183
+ end
184
+ end
185
+ end
186
+ end
187
+
188
+ require_relative "post_scraper"
189
+ require_relative "user_scraper"
@@ -0,0 +1,94 @@
1
+ require "typhoeus"
2
+
3
+ module Forki
4
+ class UserScraper < Scraper
5
# Finds and returns the number of people who like the current page.
def find_number_of_likes
  likes_pattern = /[0-9,.KM ] people like this/
  # Pick the first <span> whose text matches the likes blurb
  likes_span = all("span").find { |span| likes_pattern.match? span.text }
  extract_int_from_num_element(likes_span)
end
11
+
12
# Finds and returns the number of people who follow the current page.
# Returns nil when neither followers pattern matches the details string.
def find_number_of_followers(profile_details_string)
  # FIX: the capture group needs the "+" quantifier — without it this pattern
  # could only ever match a single-character follower count (e.g. "4" but not "4,500")
  followers_pattern = /Followed by (?<num_followers>[0-9,.KM ]+) people/
  alt_follower_pattern = /(?<num_followers>[0-9,.KM ]+) (f|F)ollowers/
  number_of_followers_match = followers_pattern.match(profile_details_string) || alt_follower_pattern.match(profile_details_string)
  return nil if number_of_followers_match.nil?

  extract_int_from_num_element(number_of_followers_match.named_captures["num_followers"])
end
20
+
21
# Extracts the follower count from a profile's timeline context item node
# (the title text carries the number, e.g. "1,234 followers").
def find_number_followers_for_normal_profile(profile_followers_node)
  title_text = profile_followers_node["node"]["timeline_context_item"]["renderer"]["context_item"]["title"]["text"]
  digits = /[0-9,]+/.match(title_text).to_s
  extract_int_from_num_element(digits)
end
27
+
28
# Returns a hash of details about a Facebook user profile, assembled from the
# page's embedded GraphQL strings.
def extract_profile_details(graphql_strings)
  profile_header_str = graphql_strings.find { |gql| gql.include? "profile_header_renderer" }
  profile_intro_str = graphql_strings.find { |g| g.include? "profile_intro_card" }
  profile_header_obj = JSON.parse(profile_header_str)["user"]["profile_header_renderer"]
  # The intro card (bio) is optional — not every profile has one
  profile_intro_obj = profile_intro_str ? JSON.parse(profile_intro_str) : nil

  number_of_followers = find_number_of_followers(profile_header_str)

  # Check if the user shows followers count
  if number_of_followers.nil?
    # Fall back to the profile tile section, which lists intro-card items
    profile_title_section = graphql_strings.find { |gql| gql.include? "profile_tile_section_type" }

    json = JSON.parse(profile_title_section)
    followers_node = json["user"]["profile_tile_sections"]["edges"].first["node"]["profile_tile_views"]["nodes"][1]["view_style_renderer"]["view"]["profile_tile_items"]["nodes"].select do |node|
      node["node"]["timeline_context_item"]["timeline_context_list_item_type"] == "INTRO_CARD_FOLLOWERS"
    end
    if followers_node.empty?
      number_of_followers = nil
    else
      number_of_followers = find_number_followers_for_normal_profile(followers_node.first)
    end
  end

  {
    id: profile_header_obj["user"]["id"],
    number_of_followers: number_of_followers,
    name: profile_header_obj["user"]["name"],
    verified: profile_header_obj["user"]["is_verified"],
    # Empty string (not nil) when the profile has no intro card
    profile: profile_intro_obj ? profile_intro_obj["profile_intro_card"]["bio"]["text"] : "",
    profile_image_url: profile_header_obj["user"]["profilePicLarge"]["uri"],
  }
end
61
+
62
# Returns a hash of details about a Facebook page, assembled from the embedded
# GraphQL strings (the about card plus the viewer/profile-photo object).
def extract_page_details(graphql_strings)
  cards_string = graphql_strings.find do |str|
    str.include?("comet_page_cards") && str.include?("follower_count")
  end
  cards = JSON.parse(cards_string)["page"]["comet_page_cards"]
  about_card = cards.find { |card| card["__typename"] == "CometPageAboutCardWithoutMapRenderer" }

  viewer_string = graphql_strings.find do |str|
    str.include?("profile_photo") && str.include?("is_verified")
  end
  viewer_page_object = JSON.parse(viewer_string)

  {
    id: about_card["page"]["id"],
    profile: about_card["page"]["page_about_fields"]["blurb"],
    number_of_followers: about_card["page"]["follower_count"],
    name: about_card["page"]["name"],
    verified: viewer_page_object["page"]["is_verified"],
    profile_image_url: viewer_page_object["page"]["profile_picture"]["uri"],
    number_of_likes: about_card["page"]["page_likers"]["global_likers_count"],
  }
end
80
+
81
# Uses GraphQL data and DOM elements to collect information about the current
# user page, handling both pages and personal profiles.
def parse(url)
  validate_and_load_page(url)
  gql_strings = find_graphql_data_strings(page.html)

  # Pages and personal profiles expose different GraphQL shapes
  parsed = gql_strings.map { |str| JSON.parse(str) }
  user_details =
    if parsed.any? { |obj| obj.key?("page") }
      extract_page_details(gql_strings)
    else
      extract_profile_details(gql_strings)
    end

  user_details[:profile_image_file] = Forki.retrieve_media(user_details[:profile_image_url])
  user_details[:profile_link] = url

  user_details
end
94
+ end