zorki 0.2.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/zorki/scrapers/post_scraper.rb +1 -1
- data/lib/zorki/scrapers/scraper.rb +15 -43
- data/lib/zorki/scrapers/user_scraper.rb +1 -1
- data/lib/zorki/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 19f084d94393dd8e69f559d7f9bc84385d80b1ac2730c0e957f031d5e67da38f
|
4
|
+
data.tar.gz: f15fbb5c622bcd2940a1adda2748178d1314ac579e31b94d859979c6e507df76
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ea049add265fd88524d894995f5cc3eabdf98c6b9610969ff7344dc86ce2e679ac0f2d6a5e9f606b2111806789d6a783c23dff9596f8a7dc66be8543eb8a3453
|
7
|
+
data.tar.gz: e6d9760c423d59280cee47262d2d92e50312c0b858ee9de2dbf4aafa32ee1cb5d7634b3a55ff2b36b5fadb2be8cc52f3b63190239a61c2f1b03116bd3a83c75f
|
@@ -171,7 +171,7 @@ module Zorki
|
|
171
171
|
end
|
172
172
|
elsif object.has_key?("display_resources")
|
173
173
|
if object["is_video"] == true
|
174
|
-
video = Zorki.retrieve_media(object["
|
174
|
+
video = Zorki.retrieve_media(object["video_url"])
|
175
175
|
video_preview_image = Zorki.retrieve_media(object["display_url"])
|
176
176
|
else
|
177
177
|
images << Zorki.retrieve_media(object["display_resources"].last["src"])
|
@@ -57,7 +57,7 @@ module Zorki
|
|
57
57
|
# same type of search there as we use for users and simplify this whole thing a lot.
|
58
58
|
#
|
59
59
|
# @returns Hash a ruby hash of the JSON data
|
60
|
-
def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil, post_data_include: nil)
|
60
|
+
def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil, post_data_include: nil, header: nil)
|
61
61
|
# So this is fun:
|
62
62
|
# For pages marked as misinformation we have to use one method (interception of request) and
|
63
63
|
# for pages that are not, we can just pull the data straight from the page.
|
@@ -73,25 +73,22 @@ module Zorki
|
|
73
73
|
page.driver.browser.intercept do |request, &continue|
|
74
74
|
# This passes the request forward unmodified, since we only care about the response
|
75
75
|
continue.call(request) && next unless request.url.include?(subpage_search)
|
76
|
-
|
76
|
+
if !header.nil?
|
77
|
+
header_key = header.keys.first.to_s
|
78
|
+
header_value = header.values.first
|
79
|
+
|
80
|
+
puts "Request Header included? #{request.headers.include?(header_key)} #{request.headers[header_key]} == #{header_value}"
|
81
|
+
continue.call(request) && next unless request.headers.include?(header_key) && request.headers[header_key] == header_value
|
82
|
+
|
83
|
+
elsif !post_data_include.nil?
|
84
|
+
continue.call(request) && next unless request.post_data&.include?(post_data_include)
|
85
|
+
end
|
77
86
|
|
78
87
|
continue.call(request) do |response|
|
79
88
|
# Check if not a CORS prefetch and finish up if not
|
80
89
|
if !response.body&.empty? && response.body
|
81
90
|
check_passed = true
|
82
91
|
|
83
|
-
if !additional_search_parameters.nil? && post_data_include.nil?
|
84
|
-
body_to_check = Oj.load(response.body)
|
85
|
-
|
86
|
-
search_parameters = additional_search_parameters.split(",")
|
87
|
-
search_parameters.each_with_index do |key, index|
|
88
|
-
break if body_to_check.nil?
|
89
|
-
|
90
|
-
check_passed = false unless body_to_check.has_key?(key)
|
91
|
-
body_to_check = body_to_check[key]
|
92
|
-
end
|
93
|
-
end
|
94
|
-
|
95
92
|
next if check_passed == false
|
96
93
|
response_body = response.body if check_passed == true
|
97
94
|
end
|
@@ -108,42 +105,17 @@ module Zorki
|
|
108
105
|
page.driver.browser.navigate.to(url)
|
109
106
|
# We wait until the correct intercept is processed or we've waited 60 seconds
|
110
107
|
start_time = Time.now
|
111
|
-
# puts "Waiting.... #{url}"
|
112
|
-
|
113
|
-
sleep(rand(1...10))
|
114
108
|
while response_body.nil? && (Time.now - start_time) < 60
|
115
109
|
sleep(0.1)
|
116
110
|
end
|
117
111
|
|
118
112
|
page.driver.execute_script("window.stop();")
|
119
113
|
|
120
|
-
#
|
121
|
-
#
|
122
|
-
|
123
|
-
doc = Nokogiri::HTML(page.driver.browser.page_source)
|
124
|
-
# elements = doc.search("script").find_all do |e|
|
125
|
-
# e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
|
126
|
-
# end
|
127
|
-
|
128
|
-
elements = doc.search("script").filter_map do |element|
|
129
|
-
parsed_element_json = nil
|
130
|
-
begin
|
131
|
-
element_json = Oj.load(element.text)
|
132
|
-
parsed_element_json = element_json["require"].last.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
|
133
|
-
rescue StandardError
|
134
|
-
next
|
135
|
-
end
|
136
|
-
|
137
|
-
parsed_element_json
|
138
|
-
end
|
139
|
-
|
140
|
-
if elements&.empty?
|
141
|
-
raise ContentUnavailableError.new("Cannot find anything", additional_data: { page_source: page.driver.browser.page_source, elements: elements })
|
142
|
-
end
|
143
|
-
|
144
|
-
return elements
|
145
|
-
end
|
114
|
+
# 1. Fix the ability to detect if a page is removed - DONE
|
115
|
+
# 2. Fix videos for slideshows - Works for reels?
|
116
|
+
# 3. Public links
|
146
117
|
|
118
|
+
# Check if something failed before we continue. Use the fake test to test
|
147
119
|
raise ContentUnavailableError.new("Response body nil") if response_body.nil?
|
148
120
|
Oj.load(response_body)
|
149
121
|
ensure
|
@@ -26,7 +26,7 @@ module Zorki
|
|
26
26
|
|
27
27
|
# This is searching for a specific request, the reason it's weird is because it's uri encoded
|
28
28
|
# graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", "data,user,media_count", post_data_include: "render_surface%22%3A%22PROFILE")
|
29
|
-
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", nil, post_data_include: "render_surface%22%3A%22PROFILE")
|
29
|
+
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", nil, post_data_include: "render_surface%22%3A%22PROFILE", header: { "X-FB-Friendly-Name": "PolarisProfilePageContentQuery" })
|
30
30
|
graphql_script = graphql_script.first if graphql_script.class == Array
|
31
31
|
|
32
32
|
if graphql_script.nil?
|
data/lib/zorki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zorki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.5
|
4
|
+
version: 0.2.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christopher Guess
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-10-
|
11
|
+
date: 2024-10-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|