zorki 0.2.5 → 0.2.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5d6f47b685aa9f49c602bc0082e7ac9084e0cb444acfa6a7989c5d5cd3d572e7
4
- data.tar.gz: 858f01c64e0efa666941222d0fade285f0c49843957cdbc0adc0dbe46e7c7ec0
3
+ metadata.gz: 19f084d94393dd8e69f559d7f9bc84385d80b1ac2730c0e957f031d5e67da38f
4
+ data.tar.gz: f15fbb5c622bcd2940a1adda2748178d1314ac579e31b94d859979c6e507df76
5
5
  SHA512:
6
- metadata.gz: 73a9ac083a40f2e7c8b03a315bd50b83545507ed9e1700d944fa8e1e0e781ce8cad6ff5a17e277d83bd1788cd7444774c5ae530be8c3205213c7dfcbfb27c9c4
7
- data.tar.gz: 201bc5e5eab638249bd4fb1e5dd859ed308e1e6fdbc4438d709144aa519bcef393404a8206929636944f65217af7e8597a6b8589f1a86f16ea3c6513e5806487
6
+ metadata.gz: ea049add265fd88524d894995f5cc3eabdf98c6b9610969ff7344dc86ce2e679ac0f2d6a5e9f606b2111806789d6a783c23dff9596f8a7dc66be8543eb8a3453
7
+ data.tar.gz: e6d9760c423d59280cee47262d2d92e50312c0b858ee9de2dbf4aafa32ee1cb5d7634b3a55ff2b36b5fadb2be8cc52f3b63190239a61c2f1b03116bd3a83c75f
@@ -171,7 +171,7 @@ module Zorki
171
171
  end
172
172
  elsif object.has_key?("display_resources")
173
173
  if object["is_video"] == true
174
- video = Zorki.retrieve_media(object["display_resources"].last["src"])
174
+ video = Zorki.retrieve_media(object["video_url"])
175
175
  video_preview_image = Zorki.retrieve_media(object["display_url"])
176
176
  else
177
177
  images << Zorki.retrieve_media(object["display_resources"].last["src"])
@@ -57,7 +57,7 @@ module Zorki
57
57
  # same type of search there as we use for users and simplify this whole thing a lot.
58
58
  #
59
59
  # @return [Hash] a Ruby hash of the JSON data
60
- def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil, post_data_include: nil)
60
+ def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil, post_data_include: nil, header: nil)
61
61
  # So this is fun:
62
62
  # For pages marked as misinformation we have to use one method (interception of the request) and
63
63
  # for pages that are not, we can just pull the data straight from the page.
@@ -73,25 +73,22 @@ module Zorki
73
73
  page.driver.browser.intercept do |request, &continue|
74
74
  # This passes the request forward unmodified, since we only care about the response
75
75
  continue.call(request) && next unless request.url.include?(subpage_search)
76
- continue.call(request) && next unless !post_data_include.nil? && request.post_data&.include?(post_data_include)
76
+ if !header.nil?
77
+ header_key = header.keys.first.to_s
78
+ header_value = header.values.first
79
+
80
+ puts "Request Header included? #{request.headers.include?(header_key)} #{request.headers[header_key]} == #{header_value}"
81
+ continue.call(request) && next unless request.headers.include?(header_key) && request.headers[header_key] == header_value
82
+
83
+ elsif !post_data_include.nil?
84
+ continue.call(request) && next unless request.post_data&.include?(post_data_include)
85
+ end
77
86
 
78
87
  continue.call(request) do |response|
79
88
  # Check that this is not a CORS preflight (empty body) and, if it is not, finish up
80
89
  if !response.body&.empty? && response.body
81
90
  check_passed = true
82
91
 
83
- if !additional_search_parameters.nil? && post_data_include.nil?
84
- body_to_check = Oj.load(response.body)
85
-
86
- search_parameters = additional_search_parameters.split(",")
87
- search_parameters.each_with_index do |key, index|
88
- break if body_to_check.nil?
89
-
90
- check_passed = false unless body_to_check.has_key?(key)
91
- body_to_check = body_to_check[key]
92
- end
93
- end
94
-
95
92
  next if check_passed == false
96
93
  response_body = response.body if check_passed == true
97
94
  end
@@ -108,42 +105,17 @@ module Zorki
108
105
  page.driver.browser.navigate.to(url)
109
106
  # We wait until the correct intercept is processed or we've waited 60 seconds
110
107
  start_time = Time.now
111
- # puts "Waiting.... #{url}"
112
-
113
- sleep(rand(1...10))
114
108
  while response_body.nil? && (Time.now - start_time) < 60
115
109
  sleep(0.1)
116
110
  end
117
111
 
118
112
  page.driver.execute_script("window.stop();")
119
113
 
120
- # If this is a page that has not been marked as misinfo we can just pull the data
121
- # TODO: put this before the whole load loop
122
- if response_body.nil?
123
- doc = Nokogiri::HTML(page.driver.browser.page_source)
124
- # elements = doc.search("script").find_all do |e|
125
- # e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
126
- # end
127
-
128
- elements = doc.search("script").filter_map do |element|
129
- parsed_element_json = nil
130
- begin
131
- element_json = Oj.load(element.text)
132
- parsed_element_json = element_json["require"].last.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
133
- rescue StandardError
134
- next
135
- end
136
-
137
- parsed_element_json
138
- end
139
-
140
- if elements&.empty?
141
- raise ContentUnavailableError.new("Cannot find anything", additional_data: { page_source: page.driver.browser.page_source, elements: elements })
142
- end
143
-
144
- return elements
145
- end
114
+ # 1. Fix the ability to detect if a page is removed - DONE
115
+ # 2. Fix videos for slideshows - Works for reels?
116
+ # 3. Public links
146
117
 
118
+ # Check if something failed before we continue. Use the fake test to test
147
119
  raise ContentUnavailableError.new("Response body nil") if response_body.nil?
148
120
  Oj.load(response_body)
149
121
  ensure
@@ -26,7 +26,7 @@ module Zorki
26
26
 
27
27
  # This is searching for a specific request, the reason it's weird is because it's uri encoded
28
28
  # graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", "data,user,media_count", post_data_include: "render_surface%22%3A%22PROFILE")
29
- graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", nil, post_data_include: "render_surface%22%3A%22PROFILE")
29
+ graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", nil, post_data_include: "render_surface%22%3A%22PROFILE", header: { "X-FB-Friendly-Name": "PolarisProfilePageContentQuery" })
30
30
  graphql_script = graphql_script.first if graphql_script.class == Array
31
31
 
32
32
  if graphql_script.nil?
data/lib/zorki/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Zorki
4
- VERSION = "0.2.5"
4
+ VERSION = "0.2.6"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: zorki
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.5
4
+ version: 0.2.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Christopher Guess
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-10-17 00:00:00.000000000 Z
11
+ date: 2024-10-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: capybara