zorki 0.2.5 → 0.2.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -3
- data/lib/zorki/scrapers/post_scraper.rb +1 -1
- data/lib/zorki/scrapers/scraper.rb +15 -43
- data/lib/zorki/scrapers/user_scraper.rb +1 -1
- data/lib/zorki/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 47ee4ce0e4bc429103d8086157b53e1db6d21d8ecaf50872913d52fe18e76f0d
|
4
|
+
data.tar.gz: 930e34afa3d53c82991e2deddc3a3e5d6f7062a7119bc8a86787a98e7c42c16f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6abd258c185bd5817d97d29baf9be8751cf970e52d5431bf3dae890adbf153bcc9410487698f5e4d6b260fe3d68fb78c81704817480faec2f770c598c23c60cd
|
7
|
+
data.tar.gz: 8ffee00d7d7e505d4f5c84193463737ffe58a5d81a26ceb3778f0712f79e77662059445a39d152e166bf554919610120a8c9f36e0f87cce43032f4db8e62d133
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
zorki (0.
|
4
|
+
zorki (0.2.6)
|
5
5
|
apparition
|
6
6
|
capybara
|
7
7
|
oj
|
@@ -87,7 +87,7 @@ GEM
|
|
87
87
|
regexp_parser (2.9.2)
|
88
88
|
reline (0.5.10)
|
89
89
|
io-console (~> 0.5)
|
90
|
-
rexml (3.3.
|
90
|
+
rexml (3.3.8)
|
91
91
|
rubocop (1.66.1)
|
92
92
|
json (~> 2.3)
|
93
93
|
language_server-protocol (>= 3.17.0)
|
@@ -125,7 +125,7 @@ GEM
|
|
125
125
|
ruby-progressbar (1.13.0)
|
126
126
|
rubyzip (2.3.2)
|
127
127
|
securerandom (0.3.1)
|
128
|
-
selenium-devtools (0.
|
128
|
+
selenium-devtools (0.129.0)
|
129
129
|
selenium-webdriver (~> 4.2)
|
130
130
|
selenium-webdriver (4.24.0)
|
131
131
|
base64 (~> 0.2)
|
@@ -171,7 +171,7 @@ module Zorki
|
|
171
171
|
end
|
172
172
|
elsif object.has_key?("display_resources")
|
173
173
|
if object["is_video"] == true
|
174
|
-
video = Zorki.retrieve_media(object["
|
174
|
+
video = Zorki.retrieve_media(object["video_url"])
|
175
175
|
video_preview_image = Zorki.retrieve_media(object["display_url"])
|
176
176
|
else
|
177
177
|
images << Zorki.retrieve_media(object["display_resources"].last["src"])
|
@@ -57,7 +57,7 @@ module Zorki
|
|
57
57
|
# same type of search there as we use for users and simplify this whole thing a lot.
|
58
58
|
#
|
59
59
|
# @return [Hash] a Ruby hash of the JSON data
|
60
|
-
def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil, post_data_include: nil)
|
60
|
+
def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil, post_data_include: nil, header: nil)
|
61
61
|
# So this is fun:
|
62
62
|
# For pages marked as misinformation we have to use one method (interception of request) and
|
63
63
|
# for pages that are not, we can just pull the data straight from the page.
|
@@ -73,25 +73,22 @@ module Zorki
|
|
73
73
|
page.driver.browser.intercept do |request, &continue|
|
74
74
|
# This passes the request forward unmodified, since we only care about the response
|
75
75
|
continue.call(request) && next unless request.url.include?(subpage_search)
|
76
|
-
|
76
|
+
if !header.nil?
|
77
|
+
header_key = header.keys.first.to_s
|
78
|
+
header_value = header.values.first
|
79
|
+
|
80
|
+
puts "Request Header included? #{request.headers.include?(header_key)} #{request.headers[header_key]} == #{header_value}"
|
81
|
+
continue.call(request) && next unless request.headers.include?(header_key) && request.headers[header_key] == header_value
|
82
|
+
|
83
|
+
elsif !post_data_include.nil?
|
84
|
+
continue.call(request) && next unless request.post_data&.include?(post_data_include)
|
85
|
+
end
|
77
86
|
|
78
87
|
continue.call(request) do |response|
|
79
88
|
# Check that this is not a CORS preflight (empty body) and finish up if it is not
|
80
89
|
if !response.body&.empty? && response.body
|
81
90
|
check_passed = true
|
82
91
|
|
83
|
-
if !additional_search_parameters.nil? && post_data_include.nil?
|
84
|
-
body_to_check = Oj.load(response.body)
|
85
|
-
|
86
|
-
search_parameters = additional_search_parameters.split(",")
|
87
|
-
search_parameters.each_with_index do |key, index|
|
88
|
-
break if body_to_check.nil?
|
89
|
-
|
90
|
-
check_passed = false unless body_to_check.has_key?(key)
|
91
|
-
body_to_check = body_to_check[key]
|
92
|
-
end
|
93
|
-
end
|
94
|
-
|
95
92
|
next if check_passed == false
|
96
93
|
response_body = response.body if check_passed == true
|
97
94
|
end
|
@@ -108,42 +105,17 @@ module Zorki
|
|
108
105
|
page.driver.browser.navigate.to(url)
|
109
106
|
# We wait until the correct intercept is processed or we've waited 60 seconds
|
110
107
|
start_time = Time.now
|
111
|
-
# puts "Waiting.... #{url}"
|
112
|
-
|
113
|
-
sleep(rand(1...10))
|
114
108
|
while response_body.nil? && (Time.now - start_time) < 60
|
115
109
|
sleep(0.1)
|
116
110
|
end
|
117
111
|
|
118
112
|
page.driver.execute_script("window.stop();")
|
119
113
|
|
120
|
-
#
|
121
|
-
#
|
122
|
-
|
123
|
-
doc = Nokogiri::HTML(page.driver.browser.page_source)
|
124
|
-
# elements = doc.search("script").find_all do |e|
|
125
|
-
# e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
|
126
|
-
# end
|
127
|
-
|
128
|
-
elements = doc.search("script").filter_map do |element|
|
129
|
-
parsed_element_json = nil
|
130
|
-
begin
|
131
|
-
element_json = Oj.load(element.text)
|
132
|
-
parsed_element_json = element_json["require"].last.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
|
133
|
-
rescue StandardError
|
134
|
-
next
|
135
|
-
end
|
136
|
-
|
137
|
-
parsed_element_json
|
138
|
-
end
|
139
|
-
|
140
|
-
if elements&.empty?
|
141
|
-
raise ContentUnavailableError.new("Cannot find anything", additional_data: { page_source: page.driver.browser.page_source, elements: elements })
|
142
|
-
end
|
143
|
-
|
144
|
-
return elements
|
145
|
-
end
|
114
|
+
# 1. Fix the ability to detect if a page is removed - DONE
|
115
|
+
# 2. Fix videos for slideshows - Works for reels?
|
116
|
+
# 3. Public links
|
146
117
|
|
118
|
+
# Check if something failed before we continue. Use the fake test to test
|
147
119
|
raise ContentUnavailableError.new("Response body nil") if response_body.nil?
|
148
120
|
Oj.load(response_body)
|
149
121
|
ensure
|
@@ -26,7 +26,7 @@ module Zorki
|
|
26
26
|
|
27
27
|
# This is searching for a specific request, the reason it's weird is because it's uri encoded
|
28
28
|
# graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", "data,user,media_count", post_data_include: "render_surface%22%3A%22PROFILE")
|
29
|
-
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", nil, post_data_include: "render_surface%22%3A%22PROFILE")
|
29
|
+
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", nil, post_data_include: "render_surface%22%3A%22PROFILE", header: { "X-FB-Friendly-Name": "PolarisProfilePageContentQuery" })
|
30
30
|
graphql_script = graphql_script.first if graphql_script.class == Array
|
31
31
|
|
32
32
|
if graphql_script.nil?
|
data/lib/zorki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zorki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christopher Guess
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-10-
|
11
|
+
date: 2024-10-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|