zorki 0.1.26 → 0.1.28
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/zorki/scrapers/post_scraper.rb +4 -0
- data/lib/zorki/scrapers/scraper.rb +37 -14
- data/lib/zorki/scrapers/user_scraper.rb +9 -7
- data/lib/zorki/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0fb9866c1d2efb0e686e6c0edd4f268c452cc18ed2f2481b46cbc1b8f2c02445
|
4
|
+
data.tar.gz: bafdf519a9b2ed1c5fb2f0711ebbf7bf7909e32769290bfe6286a0463056edc7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 13f0bce3dbe9ee6d029f79569a27d287c6679643aa0fcdbc3e176a5667d214664eae046e4f2700aab712f4f3b2e96c5535f3d05c6204fe2856c0101b911be5f6
|
7
|
+
data.tar.gz: 6279ee4bb40c5ad8a6e74be86343027d5b7b122af763274dad96eb3c60d46b30de14acc7f6e57b70b5532888f022b1bcc4db5a8b87d0281471bab519a9faf067
|
data/Gemfile.lock
CHANGED
@@ -40,6 +40,9 @@ module Zorki
|
|
40
40
|
Capybara.app_host = "https://instagram.com"
|
41
41
|
|
42
42
|
# video slideshows https://www.instagram.com/p/CY7KxwYOFBS/?utm_source=ig_embed&utm_campaign=loading
|
43
|
+
#
|
44
|
+
# TODO: Check if the post is publicly available before trying to log in
|
45
|
+
# Should help with the scraping
|
43
46
|
login
|
44
47
|
graphql_object = get_content_of_subpage_from_url(
|
45
48
|
"https://www.instagram.com/p/#{id}/",
|
@@ -149,6 +152,7 @@ module Zorki
|
|
149
152
|
end
|
150
153
|
|
151
154
|
# Take the screenshot and return it
|
155
|
+
# rubocop:disable Lint/Debugger
|
152
156
|
save_screenshot("#{Zorki.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
|
153
157
|
end
|
154
158
|
end
|
@@ -51,8 +51,13 @@ module Zorki
|
|
51
51
|
# additional_search_parameters is a comma-separated list of keys
|
52
52
|
# example: `data,xdt_api__v1__media__shortcode__web_info,items`
|
53
53
|
#
|
54
|
+
# NOTE: `post_data_include`, if not nil, overrides `additional_search_parameters`.
|
55
|
+
# This is so that I didn't have to refactor the entire code base when I added it.
|
56
|
+
# Eventually it might be better to look at the post request and see if we can do the
|
57
|
+
# same type of search there as we use for users and simplify this whole thing a lot.
|
58
|
+
#
|
54
59
|
# @returns Hash a ruby hash of the JSON data
|
55
|
-
def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
|
60
|
+
def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil, post_data_include: nil)
|
56
61
|
# So this is fun:
|
57
62
|
# For pages marked as misinformation we have to use one method (interception of the request) and
|
58
63
|
# for pages that are not, we can just pull the data straight from the page.
|
@@ -67,19 +72,25 @@ module Zorki
|
|
67
72
|
|
68
73
|
page.driver.browser.intercept do |request, &continue|
|
69
74
|
# This passes the request forward unmodified, since we only care about the response
|
70
|
-
#
|
75
|
+
#
|
76
|
+
# responses.first.post_data.include?("render_surface%22%3A%22PROFILE")
|
71
77
|
continue.call(request) && next unless request.url.include?(subpage_search)
|
78
|
+
continue.call(request) && next unless !post_data_include.nil? && request.post_data&.include?(post_data_include)
|
72
79
|
|
73
80
|
continue.call(request) do |response|
|
81
|
+
puts "***********************************************************"
|
82
|
+
puts "checking request: #{request.url}"
|
83
|
+
puts response.body
|
84
|
+
puts "***********************************************************"
|
85
|
+
|
86
|
+
# responses << response
|
74
87
|
# Check if not a CORS prefetch and finish up if not
|
75
88
|
if !response.body&.empty? && response.body
|
76
89
|
check_passed = true
|
77
90
|
|
78
|
-
|
91
|
+
if !additional_search_parameters.nil? && post_data_include.nil?
|
79
92
|
body_to_check = Oj.load(response.body)
|
80
93
|
|
81
|
-
debugger if body_to_check.include?("jokoy.komi.io")
|
82
|
-
|
83
94
|
search_parameters = additional_search_parameters.split(",")
|
84
95
|
search_parameters.each_with_index do |key, index|
|
85
96
|
break if body_to_check.nil?
|
@@ -89,11 +100,22 @@ module Zorki
|
|
89
100
|
end
|
90
101
|
end
|
91
102
|
|
103
|
+
if check_passed == false
|
104
|
+
puts "***********************************************************"
|
105
|
+
puts "checking FAILED request: #{request.url}"
|
106
|
+
puts response.body
|
107
|
+
puts "***********************************************************"
|
108
|
+
end
|
109
|
+
|
92
110
|
response_body = response.body if check_passed == true
|
93
111
|
end
|
94
112
|
end
|
95
113
|
rescue Selenium::WebDriver::Error::WebDriverError
|
96
114
|
# Eat them
|
115
|
+
rescue StandardError => e
|
116
|
+
puts "***********************************************************"
|
117
|
+
puts "Error in intercept: #{e}"
|
118
|
+
puts "***********************************************************"
|
97
119
|
end
|
98
120
|
|
99
121
|
# Now that the intercept is set up, we visit the page we want
|
@@ -112,6 +134,7 @@ module Zorki
|
|
112
134
|
# If this is a page that has not been marked as misinfo we can just pull the data
|
113
135
|
# TODO: put this before the whole load loop
|
114
136
|
if response_body.nil?
|
137
|
+
|
115
138
|
doc = Nokogiri::HTML(page.driver.browser.page_source)
|
116
139
|
# elements = doc.search("script").find_all do |e|
|
117
140
|
# e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
|
@@ -120,17 +143,17 @@ module Zorki
|
|
120
143
|
elements = doc.search("script").filter_map do |element|
|
121
144
|
parsed_element_json = nil
|
122
145
|
begin
|
123
|
-
element_json =
|
146
|
+
element_json = OJ.load(element.text)
|
124
147
|
|
125
|
-
if element.text.include?("jokoy.komi.io")
|
126
|
-
|
127
|
-
|
148
|
+
# if element.text.include?("jokoy.komi.io")
|
149
|
+
# debugger
|
150
|
+
# if element_json["require"].first.last.first["__bbox"].key?("require")
|
128
151
|
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
end
|
152
|
+
# element_json["require"].first.last.first["__bbox"]["require"].each do |x|
|
153
|
+
# debugger if x.to_s.include?("Si mulut pelaut")
|
154
|
+
# end
|
155
|
+
# end
|
156
|
+
# end
|
134
157
|
|
135
158
|
parsed_element_json = element_json["require"].last.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
|
136
159
|
rescue StandardError
|
@@ -20,22 +20,25 @@ module Zorki
|
|
20
20
|
graphql_script = nil
|
21
21
|
count = 0
|
22
22
|
loop do
|
23
|
-
raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username }) if count > 3
|
24
|
-
|
25
23
|
print "Scraping user #{username}... (attempt #{count + 1})\n"
|
26
24
|
begin
|
27
25
|
login
|
28
26
|
|
29
|
-
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql", "data,user,media_count")
|
27
|
+
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", "data,user,media_count", post_data_include: "render_surface")
|
30
28
|
graphql_script = graphql_script.first if graphql_script.class == Array
|
31
29
|
|
32
30
|
if graphql_script.nil?
|
33
31
|
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "web_profile_info")
|
34
32
|
end
|
35
|
-
rescue Zorki::ContentUnavailableError
|
33
|
+
rescue Zorki::ContentUnavailableError
|
36
34
|
count += 1
|
35
|
+
|
36
|
+
if count > 3
|
37
|
+
raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
|
38
|
+
end
|
39
|
+
|
37
40
|
page.driver.browser.navigate.to("https://www.instagram.com") # we want to go back to the main page so we start from scratch
|
38
|
-
sleep 10
|
41
|
+
sleep rand(5..10)
|
39
42
|
next
|
40
43
|
end
|
41
44
|
|
@@ -97,8 +100,7 @@ module Zorki
|
|
97
100
|
profile_image_url: profile_image_url
|
98
101
|
}
|
99
102
|
end
|
100
|
-
rescue Zorki::ContentUnavailableError
|
101
|
-
debugger
|
103
|
+
rescue Zorki::ContentUnavailableError
|
102
104
|
raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
|
103
105
|
end
|
104
106
|
end
|
data/lib/zorki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zorki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.26
|
4
|
+
version: 0.1.28
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christopher Guess
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-10-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|