RubyGems - zorki - Versions diffs - 0.1.26 → 0.1.28 - Mend

zorki 0.1.26 → 0.1.28

Files changed (7) hide show

checksums.yaml +4 -4
data/Gemfile.lock +1 -1
data/lib/zorki/scrapers/post_scraper.rb +4 -0
data/lib/zorki/scrapers/scraper.rb +37 -14
data/lib/zorki/scrapers/user_scraper.rb +9 -7
data/lib/zorki/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ec1dba1c80d66f33d1e50fc104511a5842091529c6d2b1e3f4758508ae37468a
-  data.tar.gz: b510d48cbc12d0e9b70fb17fb91aeffdad51d85f85e5813695df1c86279894db
+  metadata.gz: 0fb9866c1d2efb0e686e6c0edd4f268c452cc18ed2f2481b46cbc1b8f2c02445
+  data.tar.gz: bafdf519a9b2ed1c5fb2f0711ebbf7bf7909e32769290bfe6286a0463056edc7
 SHA512:
-  metadata.gz: 6463fc59b818e21d4c515212d04549b7a3b925433d7e9df79ff376a786f74e028ab99146b8422c64384470443d88835854717e3a19a7c68bb3a767e7277f1c39
-  data.tar.gz: 173b570bb5eb62d759a488c21aff0092c633db1f0f1a4fa771e46ad32141a2016386f698e724cbc1a7dced5ad66a9d7b22add8f7dd533d8dc88ec009ca9a2814
+  metadata.gz: 13f0bce3dbe9ee6d029f79569a27d287c6679643aa0fcdbc3e176a5667d214664eae046e4f2700aab712f4f3b2e96c5535f3d05c6204fe2856c0101b911be5f6
+  data.tar.gz: 6279ee4bb40c5ad8a6e74be86343027d5b7b122af763274dad96eb3c60d46b30de14acc7f6e57b70b5532888f022b1bcc4db5a8b87d0281471bab519a9faf067

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    zorki (0.1.25)
+    zorki (0.1.26)
       apparition
       capybara
       oj

data/lib/zorki/scrapers/post_scraper.rb CHANGED Viewed

@@ -40,6 +40,9 @@ module Zorki
       Capybara.app_host = "https://instagram.com"
       # video slideshows https://www.instagram.com/p/CY7KxwYOFBS/?utm_source=ig_embed&utm_campaign=loading
+      #
+      # TODO: Check if the post is available publicly before trying to log in
+      # Should help with the scraping
       login
       graphql_object = get_content_of_subpage_from_url(
         "https://www.instagram.com/p/#{id}/",
@@ -149,6 +152,7 @@ module Zorki
       end
       # Take the screenshot and return it
+      # rubocop:disable Lint/Debugger
       save_screenshot("#{Zorki.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
     end
   end

data/lib/zorki/scrapers/scraper.rb CHANGED Viewed

@@ -51,8 +51,13 @@ module Zorki
     # additional_search_parameters is a comma-separated list of keys
     # example: `data,xdt_api__v1__media__shortcode__web_info,items`
     #
+    # NOTE: `post_data_include`, if not nil, overrides `additional_search_parameters`.
+    # This is so that I didn't have to refactor the entire code base when I added it.
+    # Eventually it might be better to look at the post request and see if we can do the
+    # same type of search there as we use for users and simplify this whole thing a lot.
+    #
     # @returns Hash a ruby hash of the JSON data
-    def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
+    def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil, post_data_include: nil)
       # So this is fun:
       # For pages marked as misinformation we have to use one method (interception of the request) and
       # for pages that are not, we can just pull the data straight from the page.
@@ -67,19 +72,25 @@ module Zorki
       page.driver.browser.intercept do |request, &continue|
         # This passes the request forward unmodified, since we only care about the response
-        # puts "checking request: #{request.url}"
+        #
+        # responses.first.post_data.include?("render_surface%22%3A%22PROFILE")
         continue.call(request) && next unless request.url.include?(subpage_search)
+        continue.call(request) && next unless !post_data_include.nil? && request.post_data&.include?(post_data_include)
         continue.call(request) do |response|
+          puts "***********************************************************"
+          puts "checking request: #{request.url}"
+          puts response.body
+          puts "***********************************************************"
+          # responses << response
           # Check if not a CORS prefetch and finish up if not
           if !response.body&.empty? && response.body
             check_passed = true
-            unless additional_search_parameters.nil?
+            if !additional_search_parameters.nil? && post_data_include.nil?
               body_to_check = Oj.load(response.body)
-              debugger if body_to_check.include?("jokoy.komi.io")
               search_parameters = additional_search_parameters.split(",")
               search_parameters.each_with_index do |key, index|
                 break if body_to_check.nil?
@@ -89,11 +100,22 @@ module Zorki
               end
             end
+            if check_passed == false
+              puts "***********************************************************"
+              puts "checking FAILED request: #{request.url}"
+              puts response.body
+              puts "***********************************************************"
+            end
             response_body = response.body if check_passed == true
           end
         end
       rescue Selenium::WebDriver::Error::WebDriverError
         # Eat them
+      rescue StandardError => e
+        puts "***********************************************************"
+        puts "Error in intercept: #{e}"
+        puts "***********************************************************"
       end
       # Now that the intercept is set up, we visit the page we want
@@ -112,6 +134,7 @@ module Zorki
       # If this is a page that has not been marked as misinfo we can just pull the data
       # TODO: put this before the whole load loop
       if response_body.nil?
         doc = Nokogiri::HTML(page.driver.browser.page_source)
         # elements = doc.search("script").find_all do |e|
         #   e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
@@ -120,17 +143,17 @@ module Zorki
         elements = doc.search("script").filter_map do |element|
           parsed_element_json = nil
           begin
-            element_json = JSON.parse(element.text)
+            element_json = OJ.load(element.text)
-            if element.text.include?("jokoy.komi.io")
-              debugger
-              # if element_json["require"].first.last.first["__bbox"].key?("require")
+            # if element.text.include?("jokoy.komi.io")
+            # debugger
+            # if element_json["require"].first.last.first["__bbox"].key?("require")
-              #   element_json["require"].first.last.first["__bbox"]["require"].each do |x|
-              #     debugger if x.to_s.include?("Si mulut pelaut")
-              #   end
-              # end
-            end
+            #   element_json["require"].first.last.first["__bbox"]["require"].each do |x|
+            #     debugger if x.to_s.include?("Si mulut pelaut")
+            #   end
+            # end
+            # end
             parsed_element_json = element_json["require"].last.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
           rescue StandardError

data/lib/zorki/scrapers/user_scraper.rb CHANGED Viewed

@@ -20,22 +20,25 @@ module Zorki
       graphql_script = nil
       count = 0
       loop do
-        raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username }) if count > 3
         print "Scraping user #{username}... (attempt #{count + 1})\n"
         begin
           login
-          graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql", "data,user,media_count")
+          graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", "data,user,media_count", post_data_include: "render_surface")
           graphql_script = graphql_script.first if graphql_script.class == Array
           if graphql_script.nil?
             graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "web_profile_info")
           end
-        rescue Zorki::ContentUnavailableError => e
+        rescue Zorki::ContentUnavailableError
           count += 1
+          if count > 3
+            raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
+          end
           page.driver.browser.navigate.to("https://www.instagram.com") # we want to go back to the main page so we start from scratch
-          sleep 10
+          sleep rand(5..10)
           next
         end
@@ -97,8 +100,7 @@ module Zorki
           profile_image_url: profile_image_url
         }
       end
-    rescue Zorki::ContentUnavailableError => e
-      debugger
+    rescue Zorki::ContentUnavailableError
       raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
     end
   end

data/lib/zorki/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Zorki
-  VERSION = "0.1.26"
+  VERSION = "0.1.28"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: zorki
 version: !ruby/object:Gem::Version
-  version: 0.1.26
+  version: 0.1.28
 platform: ruby
 authors:
 - Christopher Guess
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-09-16 00:00:00.000000000 Z
+date: 2024-10-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: capybara