RubyGems - zorki - Versions diffs - 0.1.27 → 0.1.29 - Mend

zorki 0.1.27 → 0.1.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml +4 -4
data/lib/zorki/scrapers/post_scraper.rb +4 -1
data/lib/zorki/scrapers/scraper.rb +12 -10
data/lib/zorki/scrapers/user_scraper.rb +5 -4
data/lib/zorki/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c5472c0d436e13f2e8554b59051546fed9400ad793de71b9b2d546bb5bd02d08
-  data.tar.gz: d62650105cb0f41a48a93d4379e077a4c1b658e96ae13a30c1d8073f8f2e0546
+  metadata.gz: 684dbf1fee604da4f2ab9687223bf3aea6f1ebad0a4f9e16ef7c80dafa645ac0
+  data.tar.gz: ab628433652062674b97318864663082cf5c7e623d32ff4ec87494d54df79788
 SHA512:
-  metadata.gz: 84a98236f4ca36daf440a8aea29acec2fa6963508bae78f5ee7c4d92c2ffedf19ef8db4050deadaa5090ea770132d2a47c64a1bab87f52329bdf18dd31f4aa2e
-  data.tar.gz: e1b635b352163d08dc0ea9b5e74b3cb990a4f9a7d91ce29296ae2150692612c2a7a81fc9e04bfd33cedfd5c4dab7031e5f06a802a25032aed15036550f306328
+  metadata.gz: cfec123b6b4f1abb61c77c963b4162a9ef87273e3c3bc1dc7a67be34a5af5c38dbc70f4a43d3ae5690a3d5655a54d3081545352eb0768b926d598a575c1c3654
+  data.tar.gz: d7cc5599d995dcacc5d38dc60669f67f75d6005f282d9013ceb12bdb463b87ddf78eb9a156bf5e52ad0d3f464f1adea89abc2cc0ae371b4cbcbb981b72124593

data/lib/zorki/scrapers/post_scraper.rb CHANGED Viewed

@@ -40,6 +40,9 @@ module Zorki
       Capybara.app_host = "https://instagram.com"
       # video slideshows https://www.instagram.com/p/CY7KxwYOFBS/?utm_source=ig_embed&utm_campaign=loading
+      #
+      # TODO: Check if post is available publically before trying to login
+      # Should help with the scraping
       login
       graphql_object = get_content_of_subpage_from_url(
         "https://www.instagram.com/p/#{id}/",
@@ -47,7 +50,6 @@ module Zorki
         "data,xdt_api__v1__media__shortcode__web_info,items"
       )
       graphql_object = graphql_object.first if graphql_object.kind_of?(Array)
       # For pages that have been marked misinfo the structure is very different than not
@@ -149,6 +151,7 @@ module Zorki
       end
       # Take the screenshot and return it
+      # rubocop:disable Link/Debugger
       save_screenshot("#{Zorki.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
     end
   end

data/lib/zorki/scrapers/scraper.rb CHANGED Viewed

@@ -70,13 +70,10 @@ module Zorki
       # the one we want, and then moves on.
       response_body = nil
-      responses = []
       page.driver.browser.intercept do |request, &continue|
         # This passes the request forward unmodified, since we only care about the response
-        #
-        # responses.first.post_data.include?("render_surface%22%3A%22PROFILE")
         continue.call(request) && next unless request.url.include?(subpage_search)
-        continue.call(request) && next unless !post_data_include.nil? && request.post_data.include?(post_data_include)
+        continue.call(request) && next unless !post_data_include.nil? && request.post_data&.include?(post_data_include)
         continue.call(request) do |response|
           puts "***********************************************************"
@@ -113,6 +110,10 @@ module Zorki
         end
       rescue Selenium::WebDriver::Error::WebDriverError
         # Eat them
+      rescue StandardError => e
+        puts "***********************************************************"
+        puts "Error in intercept: #{e}"
+        puts "***********************************************************"
       end
       # Now that the intercept is set up, we visit the page we want
@@ -131,6 +132,7 @@ module Zorki
       # If this is a page that has not been marked as misinfo we can just pull the data
       # TODO: put this before the whole load loop
       if response_body.nil?
         doc = Nokogiri::HTML(page.driver.browser.page_source)
         # elements = doc.search("script").find_all do |e|
         #   e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
@@ -142,13 +144,13 @@ module Zorki
             element_json = OJ.load(element.text)
             # if element.text.include?("jokoy.komi.io")
-              # debugger
-              # if element_json["require"].first.last.first["__bbox"].key?("require")
+            # debugger
+            # if element_json["require"].first.last.first["__bbox"].key?("require")
-              #   element_json["require"].first.last.first["__bbox"]["require"].each do |x|
-              #     debugger if x.to_s.include?("Si mulut pelaut")
-              #   end
-              # end
+            #   element_json["require"].first.last.first["__bbox"]["require"].each do |x|
+            #     debugger if x.to_s.include?("Si mulut pelaut")
+            #   end
+            # end
             # end
             parsed_element_json = element_json["require"].last.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]

data/lib/zorki/scrapers/user_scraper.rb CHANGED Viewed

@@ -24,13 +24,15 @@ module Zorki
         begin
           login
-          graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", "data,user,media_count", post_data_include: "render_surface")
+          # This is searching for a specific request, the reason it's weird is because it's uri encoded
+          # graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", "data,user,media_count", post_data_include: "render_surface%22%3A%22PROFILE")
+          graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", nil, post_data_include: "render_surface%22%3A%22PROFILE")
           graphql_script = graphql_script.first if graphql_script.class == Array
           if graphql_script.nil?
             graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "web_profile_info")
           end
-        rescue Zorki::ContentUnavailableError => e
+        rescue Zorki::ContentUnavailableError
           count += 1
           if count > 3
@@ -100,8 +102,7 @@ module Zorki
           profile_image_url: profile_image_url
         }
       end
-    rescue Zorki::ContentUnavailableError => e
-      debugger
+    rescue Zorki::ContentUnavailableError
       raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
     end
   end

data/lib/zorki/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Zorki
-  VERSION = "0.1.27"
+  VERSION = "0.1.29"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: zorki
 version: !ruby/object:Gem::Version
-  version: 0.1.27
+  version: 0.1.29
 platform: ruby
 authors:
 - Christopher Guess
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-10-14 00:00:00.000000000 Z
+date: 2024-10-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: capybara