zorki 0.1.27 → 0.1.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c5472c0d436e13f2e8554b59051546fed9400ad793de71b9b2d546bb5bd02d08
4
- data.tar.gz: d62650105cb0f41a48a93d4379e077a4c1b658e96ae13a30c1d8073f8f2e0546
3
+ metadata.gz: 684dbf1fee604da4f2ab9687223bf3aea6f1ebad0a4f9e16ef7c80dafa645ac0
4
+ data.tar.gz: ab628433652062674b97318864663082cf5c7e623d32ff4ec87494d54df79788
5
5
  SHA512:
6
- metadata.gz: 84a98236f4ca36daf440a8aea29acec2fa6963508bae78f5ee7c4d92c2ffedf19ef8db4050deadaa5090ea770132d2a47c64a1bab87f52329bdf18dd31f4aa2e
7
- data.tar.gz: e1b635b352163d08dc0ea9b5e74b3cb990a4f9a7d91ce29296ae2150692612c2a7a81fc9e04bfd33cedfd5c4dab7031e5f06a802a25032aed15036550f306328
6
+ metadata.gz: cfec123b6b4f1abb61c77c963b4162a9ef87273e3c3bc1dc7a67be34a5af5c38dbc70f4a43d3ae5690a3d5655a54d3081545352eb0768b926d598a575c1c3654
7
+ data.tar.gz: d7cc5599d995dcacc5d38dc60669f67f75d6005f282d9013ceb12bdb463b87ddf78eb9a156bf5e52ad0d3f464f1adea89abc2cc0ae371b4cbcbb981b72124593
@@ -40,6 +40,9 @@ module Zorki
40
40
  Capybara.app_host = "https://instagram.com"
41
41
 
42
42
  # video slideshows https://www.instagram.com/p/CY7KxwYOFBS/?utm_source=ig_embed&utm_campaign=loading
43
+ #
44
+ # TODO: Check if post is available publicly before trying to log in
45
+ # Should help with the scraping
43
46
  login
44
47
  graphql_object = get_content_of_subpage_from_url(
45
48
  "https://www.instagram.com/p/#{id}/",
@@ -47,7 +50,6 @@ module Zorki
47
50
  "data,xdt_api__v1__media__shortcode__web_info,items"
48
51
  )
49
52
 
50
-
51
53
  graphql_object = graphql_object.first if graphql_object.kind_of?(Array)
52
54
 
53
55
  # For pages that have been marked misinfo the structure is very different from pages that have not
@@ -149,6 +151,7 @@ module Zorki
149
151
  end
150
152
 
151
153
  # Take the screenshot and return it
154
+ # rubocop:disable Lint/Debugger
152
155
  save_screenshot("#{Zorki.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
153
156
  end
154
157
  end
@@ -70,13 +70,10 @@ module Zorki
70
70
  # the one we want, and then moves on.
71
71
  response_body = nil
72
72
 
73
- responses = []
74
73
  page.driver.browser.intercept do |request, &continue|
75
74
  # This passes the request forward unmodified, since we only care about the response
76
- #
77
- # responses.first.post_data.include?("render_surface%22%3A%22PROFILE")
78
75
  continue.call(request) && next unless request.url.include?(subpage_search)
79
- continue.call(request) && next unless !post_data_include.nil? && request.post_data.include?(post_data_include)
76
+ continue.call(request) && next unless !post_data_include.nil? && request.post_data&.include?(post_data_include)
80
77
 
81
78
  continue.call(request) do |response|
82
79
  puts "***********************************************************"
@@ -113,6 +110,10 @@ module Zorki
113
110
  end
114
111
  rescue Selenium::WebDriver::Error::WebDriverError
115
112
  # Eat them
113
+ rescue StandardError => e
114
+ puts "***********************************************************"
115
+ puts "Error in intercept: #{e}"
116
+ puts "***********************************************************"
116
117
  end
117
118
 
118
119
  # Now that the intercept is set up, we visit the page we want
@@ -131,6 +132,7 @@ module Zorki
131
132
  # If this is a page that has not been marked as misinfo we can just pull the data
132
133
  # TODO: put this before the whole load loop
133
134
  if response_body.nil?
135
+
134
136
  doc = Nokogiri::HTML(page.driver.browser.page_source)
135
137
  # elements = doc.search("script").find_all do |e|
136
138
  # e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
@@ -142,13 +144,13 @@ module Zorki
142
144
  element_json = OJ.load(element.text)
143
145
 
144
146
  # if element.text.include?("jokoy.komi.io")
145
- # debugger
146
- # if element_json["require"].first.last.first["__bbox"].key?("require")
147
+ # debugger
148
+ # if element_json["require"].first.last.first["__bbox"].key?("require")
147
149
 
148
- # element_json["require"].first.last.first["__bbox"]["require"].each do |x|
149
- # debugger if x.to_s.include?("Si mulut pelaut")
150
- # end
151
- # end
150
+ # element_json["require"].first.last.first["__bbox"]["require"].each do |x|
151
+ # debugger if x.to_s.include?("Si mulut pelaut")
152
+ # end
153
+ # end
152
154
  # end
153
155
 
154
156
  parsed_element_json = element_json["require"].last.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
@@ -24,13 +24,15 @@ module Zorki
24
24
  begin
25
25
  login
26
26
 
27
- graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", "data,user,media_count", post_data_include: "render_surface")
27
+ # This searches for a specific request; the search string looks odd because it is URI-encoded
28
+ # graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", "data,user,media_count", post_data_include: "render_surface%22%3A%22PROFILE")
29
+ graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", nil, post_data_include: "render_surface%22%3A%22PROFILE")
28
30
  graphql_script = graphql_script.first if graphql_script.class == Array
29
31
 
30
32
  if graphql_script.nil?
31
33
  graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "web_profile_info")
32
34
  end
33
- rescue Zorki::ContentUnavailableError => e
35
+ rescue Zorki::ContentUnavailableError
34
36
  count += 1
35
37
 
36
38
  if count > 3
@@ -100,8 +102,7 @@ module Zorki
100
102
  profile_image_url: profile_image_url
101
103
  }
102
104
  end
103
- rescue Zorki::ContentUnavailableError => e
104
- debugger
105
+ rescue Zorki::ContentUnavailableError
105
106
  raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
106
107
  end
107
108
  end
data/lib/zorki/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Zorki
4
- VERSION = "0.1.27"
4
+ VERSION = "0.1.29"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: zorki
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.27
4
+ version: 0.1.29
5
5
  platform: ruby
6
6
  authors:
7
7
  - Christopher Guess
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-10-14 00:00:00.000000000 Z
11
+ date: 2024-10-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: capybara