zorki 0.1.27 → 0.1.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/zorki/scrapers/post_scraper.rb +4 -0
- data/lib/zorki/scrapers/scraper.rb +12 -8
- data/lib/zorki/scrapers/user_scraper.rb +2 -3
- data/lib/zorki/version.rb +1 -1
- metadata +2 -2
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 0fb9866c1d2efb0e686e6c0edd4f268c452cc18ed2f2481b46cbc1b8f2c02445
         | 
| 4 | 
            +
              data.tar.gz: bafdf519a9b2ed1c5fb2f0711ebbf7bf7909e32769290bfe6286a0463056edc7
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 13f0bce3dbe9ee6d029f79569a27d287c6679643aa0fcdbc3e176a5667d214664eae046e4f2700aab712f4f3b2e96c5535f3d05c6204fe2856c0101b911be5f6
         | 
| 7 | 
            +
              data.tar.gz: 6279ee4bb40c5ad8a6e74be86343027d5b7b122af763274dad96eb3c60d46b30de14acc7f6e57b70b5532888f022b1bcc4db5a8b87d0281471bab519a9faf067
         | 
| @@ -40,6 +40,9 @@ module Zorki | |
| 40 40 | 
             
                  Capybara.app_host = "https://instagram.com"
         | 
| 41 41 |  | 
| 42 42 | 
             
                  # video slideshows https://www.instagram.com/p/CY7KxwYOFBS/?utm_source=ig_embed&utm_campaign=loading
         | 
| 43 | 
            +
                  #
         | 
| 44 | 
            +
                  # TODO: Check if post is available publicly before trying to login
         | 
| 45 | 
            +
                  # Should help with the scraping
         | 
| 43 46 | 
             
                  login
         | 
| 44 47 | 
             
                  graphql_object = get_content_of_subpage_from_url(
         | 
| 45 48 | 
             
                    "https://www.instagram.com/p/#{id}/",
         | 
| @@ -149,6 +152,7 @@ module Zorki | |
| 149 152 | 
             
                  end
         | 
| 150 153 |  | 
| 151 154 | 
             
                  # Take the screenshot and return it
         | 
| 155 | 
            +
                  # rubocop:disable Lint/Debugger
         | 
| 152 156 | 
             
                  save_screenshot("#{Zorki.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
         | 
| 153 157 | 
             
                end
         | 
| 154 158 | 
             
              end
         | 
| @@ -70,13 +70,12 @@ module Zorki | |
| 70 70 | 
             
                  # the one we want, and then moves on.
         | 
| 71 71 | 
             
                  response_body = nil
         | 
| 72 72 |  | 
| 73 | 
            -
                  responses = []
         | 
| 74 73 | 
             
                  page.driver.browser.intercept do |request, &continue|
         | 
| 75 74 | 
             
                    # This passes the request forward unmodified, since we only care about the response
         | 
| 76 75 | 
             
                    #
         | 
| 77 76 | 
             
                    # responses.first.post_data.include?("render_surface%22%3A%22PROFILE")
         | 
| 78 77 | 
             
                    continue.call(request) && next unless request.url.include?(subpage_search)
         | 
| 79 | 
            -
                    continue.call(request) && next unless !post_data_include.nil? && request.post_data | 
| 78 | 
            +
                    continue.call(request) && next unless !post_data_include.nil? && request.post_data&.include?(post_data_include)
         | 
| 80 79 |  | 
| 81 80 | 
             
                    continue.call(request) do |response|
         | 
| 82 81 | 
             
                      puts "***********************************************************"
         | 
| @@ -113,6 +112,10 @@ module Zorki | |
| 113 112 | 
             
                    end
         | 
| 114 113 | 
             
                  rescue Selenium::WebDriver::Error::WebDriverError
         | 
| 115 114 | 
             
                    # Eat them
         | 
| 115 | 
            +
                  rescue StandardError => e
         | 
| 116 | 
            +
                    puts "***********************************************************"
         | 
| 117 | 
            +
                    puts "Error in intercept: #{e}"
         | 
| 118 | 
            +
                    puts "***********************************************************"
         | 
| 116 119 | 
             
                  end
         | 
| 117 120 |  | 
| 118 121 | 
             
                  # Now that the intercept is set up, we visit the page we want
         | 
| @@ -131,6 +134,7 @@ module Zorki | |
| 131 134 | 
             
                  # If this is a page that has not been marked as misinfo we can just pull the data
         | 
| 132 135 | 
             
                  # TODO: put this before the whole load loop
         | 
| 133 136 | 
             
                  if response_body.nil?
         | 
| 137 | 
            +
             | 
| 134 138 | 
             
                    doc = Nokogiri::HTML(page.driver.browser.page_source)
         | 
| 135 139 | 
             
                    # elements = doc.search("script").find_all do |e|
         | 
| 136 140 | 
             
                    #   e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
         | 
| @@ -142,13 +146,13 @@ module Zorki | |
| 142 146 | 
             
                        element_json = OJ.load(element.text)
         | 
| 143 147 |  | 
| 144 148 | 
             
                        # if element.text.include?("jokoy.komi.io")
         | 
| 145 | 
            -
             | 
| 146 | 
            -
             | 
| 149 | 
            +
                        # debugger
         | 
| 150 | 
            +
                        # if element_json["require"].first.last.first["__bbox"].key?("require")
         | 
| 147 151 |  | 
| 148 | 
            -
             | 
| 149 | 
            -
             | 
| 150 | 
            -
             | 
| 151 | 
            -
             | 
| 152 | 
            +
                        #   element_json["require"].first.last.first["__bbox"]["require"].each do |x|
         | 
| 153 | 
            +
                        #     debugger if x.to_s.include?("Si mulut pelaut")
         | 
| 154 | 
            +
                        #   end
         | 
| 155 | 
            +
                        # end
         | 
| 152 156 | 
             
                        # end
         | 
| 153 157 |  | 
| 154 158 | 
             
                        parsed_element_json = element_json["require"].last.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
         | 
| @@ -30,7 +30,7 @@ module Zorki | |
| 30 30 | 
             
                      if graphql_script.nil?
         | 
| 31 31 | 
             
                        graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "web_profile_info")
         | 
| 32 32 | 
             
                      end
         | 
| 33 | 
            -
                    rescue Zorki::ContentUnavailableError | 
| 33 | 
            +
                    rescue Zorki::ContentUnavailableError
         | 
| 34 34 | 
             
                      count += 1
         | 
| 35 35 |  | 
| 36 36 | 
             
                      if count > 3
         | 
| @@ -100,8 +100,7 @@ module Zorki | |
| 100 100 | 
             
                      profile_image_url: profile_image_url
         | 
| 101 101 | 
             
                    }
         | 
| 102 102 | 
             
                  end
         | 
| 103 | 
            -
                rescue Zorki::ContentUnavailableError | 
| 104 | 
            -
                  debugger
         | 
| 103 | 
            +
                rescue Zorki::ContentUnavailableError
         | 
| 105 104 | 
             
                  raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
         | 
| 106 105 | 
             
                end
         | 
| 107 106 | 
             
              end
         | 
    
        data/lib/zorki/version.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: zorki
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.1.27 | 
| 4 | 
            +
              version: 0.1.28
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Christopher Guess
         | 
| 8 8 | 
             
            autorequire:
         | 
| 9 9 | 
             
            bindir: exe
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2024-10- | 
| 11 | 
            +
            date: 2024-10-15 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: capybara
         |