zorki 0.1.26 → 0.1.28

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ec1dba1c80d66f33d1e50fc104511a5842091529c6d2b1e3f4758508ae37468a
4
- data.tar.gz: b510d48cbc12d0e9b70fb17fb91aeffdad51d85f85e5813695df1c86279894db
3
+ metadata.gz: 0fb9866c1d2efb0e686e6c0edd4f268c452cc18ed2f2481b46cbc1b8f2c02445
4
+ data.tar.gz: bafdf519a9b2ed1c5fb2f0711ebbf7bf7909e32769290bfe6286a0463056edc7
5
5
  SHA512:
6
- metadata.gz: 6463fc59b818e21d4c515212d04549b7a3b925433d7e9df79ff376a786f74e028ab99146b8422c64384470443d88835854717e3a19a7c68bb3a767e7277f1c39
7
- data.tar.gz: 173b570bb5eb62d759a488c21aff0092c633db1f0f1a4fa771e46ad32141a2016386f698e724cbc1a7dced5ad66a9d7b22add8f7dd533d8dc88ec009ca9a2814
6
+ metadata.gz: 13f0bce3dbe9ee6d029f79569a27d287c6679643aa0fcdbc3e176a5667d214664eae046e4f2700aab712f4f3b2e96c5535f3d05c6204fe2856c0101b911be5f6
7
+ data.tar.gz: 6279ee4bb40c5ad8a6e74be86343027d5b7b122af763274dad96eb3c60d46b30de14acc7f6e57b70b5532888f022b1bcc4db5a8b87d0281471bab519a9faf067
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- zorki (0.1.25)
4
+ zorki (0.1.26)
5
5
  apparition
6
6
  capybara
7
7
  oj
@@ -40,6 +40,9 @@ module Zorki
40
40
  Capybara.app_host = "https://instagram.com"
41
41
 
42
42
  # video slideshows https://www.instagram.com/p/CY7KxwYOFBS/?utm_source=ig_embed&utm_campaign=loading
43
+ #
44
+ # TODO: Check if post is available publicly before trying to log in
45
+ # Should help with the scraping
43
46
  login
44
47
  graphql_object = get_content_of_subpage_from_url(
45
48
  "https://www.instagram.com/p/#{id}/",
@@ -149,6 +152,7 @@ module Zorki
149
152
  end
150
153
 
151
154
  # Take the screenshot and return it
155
+ # rubocop:disable Lint/Debugger
152
156
  save_screenshot("#{Zorki.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
153
157
  end
154
158
  end
@@ -51,8 +51,13 @@ module Zorki
51
51
  # additional_search_parameters is a comma-separated list of keys
52
52
  # example: `data,xdt_api__v1__media__shortcode__web_info,items`
53
53
  #
54
+ # NOTE: `post_data_include` if not nil overrules the additional_search_parameters
55
+ # This is so that I didn't have to refactor the entire code base when I added it.
56
+ # Eventually it might be better to look at the post request and see if we can do the
57
+ # same type of search there as we use for users and simplify this whole thing a lot.
58
+ #
54
59
  # @returns Hash a ruby hash of the JSON data
55
- def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
60
+ def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil, post_data_include: nil)
56
61
  # So this is fun:
57
62
  # For pages marked as misinformation we have to use one method (interception of request) and
58
63
  # for pages that are not, we can just pull the data straight from the page.
@@ -67,19 +72,25 @@ module Zorki
67
72
 
68
73
  page.driver.browser.intercept do |request, &continue|
69
74
  # This passes the request forward unmodified, since we only care about the response
70
- # puts "checking request: #{request.url}"
75
+ #
76
+ # responses.first.post_data.include?("render_surface%22%3A%22PROFILE")
71
77
  continue.call(request) && next unless request.url.include?(subpage_search)
78
+ continue.call(request) && next unless !post_data_include.nil? && request.post_data&.include?(post_data_include)
72
79
 
73
80
  continue.call(request) do |response|
81
+ puts "***********************************************************"
82
+ puts "checking request: #{request.url}"
83
+ puts response.body
84
+ puts "***********************************************************"
85
+
86
+ # responses << response
74
87
  # Check if not a CORS prefetch and finish up if not
75
88
  if !response.body&.empty? && response.body
76
89
  check_passed = true
77
90
 
78
- unless additional_search_parameters.nil?
91
+ if !additional_search_parameters.nil? && post_data_include.nil?
79
92
  body_to_check = Oj.load(response.body)
80
93
 
81
- debugger if body_to_check.include?("jokoy.komi.io")
82
-
83
94
  search_parameters = additional_search_parameters.split(",")
84
95
  search_parameters.each_with_index do |key, index|
85
96
  break if body_to_check.nil?
@@ -89,11 +100,22 @@ module Zorki
89
100
  end
90
101
  end
91
102
 
103
+ if check_passed == false
104
+ puts "***********************************************************"
105
+ puts "checking FAILED request: #{request.url}"
106
+ puts response.body
107
+ puts "***********************************************************"
108
+ end
109
+
92
110
  response_body = response.body if check_passed == true
93
111
  end
94
112
  end
95
113
  rescue Selenium::WebDriver::Error::WebDriverError
96
114
  # Eat them
115
+ rescue StandardError => e
116
+ puts "***********************************************************"
117
+ puts "Error in intercept: #{e}"
118
+ puts "***********************************************************"
97
119
  end
98
120
 
99
121
  # Now that the intercept is set up, we visit the page we want
@@ -112,6 +134,7 @@ module Zorki
112
134
  # If this is a page that has not been marked as misinfo we can just pull the data
113
135
  # TODO: put this before the whole load loop
114
136
  if response_body.nil?
137
+
115
138
  doc = Nokogiri::HTML(page.driver.browser.page_source)
116
139
  # elements = doc.search("script").find_all do |e|
117
140
  # e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
@@ -120,17 +143,17 @@ module Zorki
120
143
  elements = doc.search("script").filter_map do |element|
121
144
  parsed_element_json = nil
122
145
  begin
123
- element_json = JSON.parse(element.text)
146
+ element_json = OJ.load(element.text)
124
147
 
125
- if element.text.include?("jokoy.komi.io")
126
- debugger
127
- # if element_json["require"].first.last.first["__bbox"].key?("require")
148
+ # if element.text.include?("jokoy.komi.io")
149
+ # debugger
150
+ # if element_json["require"].first.last.first["__bbox"].key?("require")
128
151
 
129
- # element_json["require"].first.last.first["__bbox"]["require"].each do |x|
130
- # debugger if x.to_s.include?("Si mulut pelaut")
131
- # end
132
- # end
133
- end
152
+ # element_json["require"].first.last.first["__bbox"]["require"].each do |x|
153
+ # debugger if x.to_s.include?("Si mulut pelaut")
154
+ # end
155
+ # end
156
+ # end
134
157
 
135
158
  parsed_element_json = element_json["require"].last.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
136
159
  rescue StandardError
@@ -20,22 +20,25 @@ module Zorki
20
20
  graphql_script = nil
21
21
  count = 0
22
22
  loop do
23
- raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username }) if count > 3
24
-
25
23
  print "Scraping user #{username}... (attempt #{count + 1})\n"
26
24
  begin
27
25
  login
28
26
 
29
- graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql", "data,user,media_count")
27
+ graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", "data,user,media_count", post_data_include: "render_surface")
30
28
  graphql_script = graphql_script.first if graphql_script.class == Array
31
29
 
32
30
  if graphql_script.nil?
33
31
  graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "web_profile_info")
34
32
  end
35
- rescue Zorki::ContentUnavailableError => e
33
+ rescue Zorki::ContentUnavailableError
36
34
  count += 1
35
+
36
+ if count > 3
37
+ raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
38
+ end
39
+
37
40
  page.driver.browser.navigate.to("https://www.instagram.com") # we want to go back to the main page so we start from scratch
38
- sleep 10
41
+ sleep rand(5..10)
39
42
  next
40
43
  end
41
44
 
@@ -97,8 +100,7 @@ module Zorki
97
100
  profile_image_url: profile_image_url
98
101
  }
99
102
  end
100
- rescue Zorki::ContentUnavailableError => e
101
- debugger
103
+ rescue Zorki::ContentUnavailableError
102
104
  raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
103
105
  end
104
106
  end
data/lib/zorki/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Zorki
4
- VERSION = "0.1.26"
4
+ VERSION = "0.1.28"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: zorki
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.26
4
+ version: 0.1.28
5
5
  platform: ruby
6
6
  authors:
7
7
  - Christopher Guess
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-09-16 00:00:00.000000000 Z
11
+ date: 2024-10-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: capybara