zorki 0.2.4 → 0.2.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/zorki/scrapers/post_scraper.rb +15 -3
- data/lib/zorki/scrapers/scraper.rb +15 -43
- data/lib/zorki/scrapers/user_scraper.rb +1 -1
- data/lib/zorki/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 19f084d94393dd8e69f559d7f9bc84385d80b1ac2730c0e957f031d5e67da38f
|
4
|
+
data.tar.gz: f15fbb5c622bcd2940a1adda2748178d1314ac579e31b94d859979c6e507df76
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ea049add265fd88524d894995f5cc3eabdf98c6b9610969ff7344dc86ce2e679ac0f2d6a5e9f606b2111806789d6a783c23dff9596f8a7dc66be8543eb8a3453
|
7
|
+
data.tar.gz: e6d9760c423d59280cee47262d2d92e50312c0b858ee9de2dbf4aafa32ee1cb5d7634b3a55ff2b36b5fadb2be8cc52f3b63190239a61c2f1b03116bd3a83c75f
|
@@ -135,9 +135,21 @@ module Zorki
|
|
135
135
|
# Go through the entire JSON structure (below for now) and make sure it hits all the points
|
136
136
|
|
137
137
|
object = graphql_object["data"]["xdt_shortcode_media"]
|
138
|
-
|
138
|
+
|
139
|
+
begin
|
140
|
+
date = object["edge_media_to_caption"]["edges"].first["node"]["created_at"]
|
141
|
+
rescue StandardError
|
142
|
+
date = object["taken_at_timestamp"].to_s
|
143
|
+
end
|
144
|
+
|
139
145
|
date = DateTime.strptime(date, "%s")
|
140
|
-
|
146
|
+
|
147
|
+
begin
|
148
|
+
text = object["edge_media_to_caption"]["edges"].first["node"]["text"]
|
149
|
+
rescue StandardError
|
150
|
+
text = ""
|
151
|
+
end
|
152
|
+
|
141
153
|
number_of_likes = object["edge_media_preview_like"]["count"]
|
142
154
|
username = object["owner"]["username"]
|
143
155
|
id = object["shortcode"]
|
@@ -159,7 +171,7 @@ module Zorki
|
|
159
171
|
end
|
160
172
|
elsif object.has_key?("display_resources")
|
161
173
|
if object["is_video"] == true
|
162
|
-
video = Zorki.retrieve_media(object["
|
174
|
+
video = Zorki.retrieve_media(object["video_url"])
|
163
175
|
video_preview_image = Zorki.retrieve_media(object["display_url"])
|
164
176
|
else
|
165
177
|
images << Zorki.retrieve_media(object["display_resources"].last["src"])
|
@@ -57,7 +57,7 @@ module Zorki
|
|
57
57
|
# same type of search there as we use for users and simplify this whole thing a lot.
|
58
58
|
#
|
59
59
|
# @returns Hash a ruby hash of the JSON data
|
60
|
-
def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil, post_data_include: nil)
|
60
|
+
def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil, post_data_include: nil, header: nil)
|
61
61
|
# So this is fun:
|
62
62
|
# For pages marked as misinformation we have to use one method (interception of request) and
|
63
63
|
# for pages that are not, we can just pull the data straight from the page.
|
@@ -73,25 +73,22 @@ module Zorki
|
|
73
73
|
page.driver.browser.intercept do |request, &continue|
|
74
74
|
# This passes the request forward unmodified, since we only care about the response
|
75
75
|
continue.call(request) && next unless request.url.include?(subpage_search)
|
76
|
-
|
76
|
+
if !header.nil?
|
77
|
+
header_key = header.keys.first.to_s
|
78
|
+
header_value = header.values.first
|
79
|
+
|
80
|
+
puts "Request Header included? #{request.headers.include?(header_key)} #{request.headers[header_key]} == #{header_value}"
|
81
|
+
continue.call(request) && next unless request.headers.include?(header_key) && request.headers[header_key] == header_value
|
82
|
+
|
83
|
+
elsif !post_data_include.nil?
|
84
|
+
continue.call(request) && next unless request.post_data&.include?(post_data_include)
|
85
|
+
end
|
77
86
|
|
78
87
|
continue.call(request) do |response|
|
79
88
|
# Check if not a CORS prefetch and finish up if not
|
80
89
|
if !response.body&.empty? && response.body
|
81
90
|
check_passed = true
|
82
91
|
|
83
|
-
if !additional_search_parameters.nil? && post_data_include.nil?
|
84
|
-
body_to_check = Oj.load(response.body)
|
85
|
-
|
86
|
-
search_parameters = additional_search_parameters.split(",")
|
87
|
-
search_parameters.each_with_index do |key, index|
|
88
|
-
break if body_to_check.nil?
|
89
|
-
|
90
|
-
check_passed = false unless body_to_check.has_key?(key)
|
91
|
-
body_to_check = body_to_check[key]
|
92
|
-
end
|
93
|
-
end
|
94
|
-
|
95
92
|
next if check_passed == false
|
96
93
|
response_body = response.body if check_passed == true
|
97
94
|
end
|
@@ -108,42 +105,17 @@ module Zorki
|
|
108
105
|
page.driver.browser.navigate.to(url)
|
109
106
|
# We wait until the correct intercept is processed or we've waited 60 seconds
|
110
107
|
start_time = Time.now
|
111
|
-
# puts "Waiting.... #{url}"
|
112
|
-
|
113
|
-
sleep(rand(1...10))
|
114
108
|
while response_body.nil? && (Time.now - start_time) < 60
|
115
109
|
sleep(0.1)
|
116
110
|
end
|
117
111
|
|
118
112
|
page.driver.execute_script("window.stop();")
|
119
113
|
|
120
|
-
#
|
121
|
-
#
|
122
|
-
|
123
|
-
doc = Nokogiri::HTML(page.driver.browser.page_source)
|
124
|
-
# elements = doc.search("script").find_all do |e|
|
125
|
-
# e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
|
126
|
-
# end
|
127
|
-
|
128
|
-
elements = doc.search("script").filter_map do |element|
|
129
|
-
parsed_element_json = nil
|
130
|
-
begin
|
131
|
-
element_json = Oj.load(element.text)
|
132
|
-
parsed_element_json = element_json["require"].last.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
|
133
|
-
rescue StandardError
|
134
|
-
next
|
135
|
-
end
|
136
|
-
|
137
|
-
parsed_element_json
|
138
|
-
end
|
139
|
-
|
140
|
-
if elements&.empty?
|
141
|
-
raise ContentUnavailableError.new("Cannot find anything", additional_data: { page_source: page.driver.browser.page_source, elements: elements })
|
142
|
-
end
|
143
|
-
|
144
|
-
return elements
|
145
|
-
end
|
114
|
+
# 1. Fix the ability to detect if a page is removed -DONE
|
115
|
+
# 2. Fix videos for slideshows - Works for reels?
|
116
|
+
# 3. Public links
|
146
117
|
|
118
|
+
# Check if something failed before we continue. Use the fake test to test
|
147
119
|
raise ContentUnavailableError.new("Response body nil") if response_body.nil?
|
148
120
|
Oj.load(response_body)
|
149
121
|
ensure
|
@@ -26,7 +26,7 @@ module Zorki
|
|
26
26
|
|
27
27
|
# This is searching for a specific request, the reason it's weird is because it's uri encoded
|
28
28
|
# graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", "data,user,media_count", post_data_include: "render_surface%22%3A%22PROFILE")
|
29
|
-
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", nil, post_data_include: "render_surface%22%3A%22PROFILE")
|
29
|
+
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", nil, post_data_include: "render_surface%22%3A%22PROFILE", header: { "X-FB-Friendly-Name": "PolarisProfilePageContentQuery" })
|
30
30
|
graphql_script = graphql_script.first if graphql_script.class == Array
|
31
31
|
|
32
32
|
if graphql_script.nil?
|
data/lib/zorki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zorki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.4
|
4
|
+
version: 0.2.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christopher Guess
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-10-
|
11
|
+
date: 2024-10-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|