zorki 0.1.27 → 0.1.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/zorki/scrapers/post_scraper.rb +4 -0
- data/lib/zorki/scrapers/scraper.rb +12 -8
- data/lib/zorki/scrapers/user_scraper.rb +2 -3
- data/lib/zorki/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0fb9866c1d2efb0e686e6c0edd4f268c452cc18ed2f2481b46cbc1b8f2c02445
|
4
|
+
data.tar.gz: bafdf519a9b2ed1c5fb2f0711ebbf7bf7909e32769290bfe6286a0463056edc7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 13f0bce3dbe9ee6d029f79569a27d287c6679643aa0fcdbc3e176a5667d214664eae046e4f2700aab712f4f3b2e96c5535f3d05c6204fe2856c0101b911be5f6
|
7
|
+
data.tar.gz: 6279ee4bb40c5ad8a6e74be86343027d5b7b122af763274dad96eb3c60d46b30de14acc7f6e57b70b5532888f022b1bcc4db5a8b87d0281471bab519a9faf067
|
@@ -40,6 +40,9 @@ module Zorki
|
|
40
40
|
Capybara.app_host = "https://instagram.com"
|
41
41
|
|
42
42
|
# video slideshows https://www.instagram.com/p/CY7KxwYOFBS/?utm_source=ig_embed&utm_campaign=loading
|
43
|
+
#
|
44
|
+
# TODO: Check if post is available publically before trying to login
|
45
|
+
# Should help with the scraping
|
43
46
|
login
|
44
47
|
graphql_object = get_content_of_subpage_from_url(
|
45
48
|
"https://www.instagram.com/p/#{id}/",
|
@@ -149,6 +152,7 @@ module Zorki
|
|
149
152
|
end
|
150
153
|
|
151
154
|
# Take the screenshot and return it
|
155
|
+
# rubocop:disable Link/Debugger
|
152
156
|
save_screenshot("#{Zorki.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
|
153
157
|
end
|
154
158
|
end
|
@@ -70,13 +70,12 @@ module Zorki
|
|
70
70
|
# the one we want, and then moves on.
|
71
71
|
response_body = nil
|
72
72
|
|
73
|
-
responses = []
|
74
73
|
page.driver.browser.intercept do |request, &continue|
|
75
74
|
# This passes the request forward unmodified, since we only care about the response
|
76
75
|
#
|
77
76
|
# responses.first.post_data.include?("render_surface%22%3A%22PROFILE")
|
78
77
|
continue.call(request) && next unless request.url.include?(subpage_search)
|
79
|
-
continue.call(request) && next unless !post_data_include.nil? && request.post_data
|
78
|
+
continue.call(request) && next unless !post_data_include.nil? && request.post_data&.include?(post_data_include)
|
80
79
|
|
81
80
|
continue.call(request) do |response|
|
82
81
|
puts "***********************************************************"
|
@@ -113,6 +112,10 @@ module Zorki
|
|
113
112
|
end
|
114
113
|
rescue Selenium::WebDriver::Error::WebDriverError
|
115
114
|
# Eat them
|
115
|
+
rescue StandardError => e
|
116
|
+
puts "***********************************************************"
|
117
|
+
puts "Error in intercept: #{e}"
|
118
|
+
puts "***********************************************************"
|
116
119
|
end
|
117
120
|
|
118
121
|
# Now that the intercept is set up, we visit the page we want
|
@@ -131,6 +134,7 @@ module Zorki
|
|
131
134
|
# If this is a page that has not been marked as misinfo we can just pull the data
|
132
135
|
# TODO: put this before the whole load loop
|
133
136
|
if response_body.nil?
|
137
|
+
|
134
138
|
doc = Nokogiri::HTML(page.driver.browser.page_source)
|
135
139
|
# elements = doc.search("script").find_all do |e|
|
136
140
|
# e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
|
@@ -142,13 +146,13 @@ module Zorki
|
|
142
146
|
element_json = OJ.load(element.text)
|
143
147
|
|
144
148
|
# if element.text.include?("jokoy.komi.io")
|
145
|
-
|
146
|
-
|
149
|
+
# debugger
|
150
|
+
# if element_json["require"].first.last.first["__bbox"].key?("require")
|
147
151
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
+
# element_json["require"].first.last.first["__bbox"]["require"].each do |x|
|
153
|
+
# debugger if x.to_s.include?("Si mulut pelaut")
|
154
|
+
# end
|
155
|
+
# end
|
152
156
|
# end
|
153
157
|
|
154
158
|
parsed_element_json = element_json["require"].last.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
|
@@ -30,7 +30,7 @@ module Zorki
|
|
30
30
|
if graphql_script.nil?
|
31
31
|
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "web_profile_info")
|
32
32
|
end
|
33
|
-
rescue Zorki::ContentUnavailableError
|
33
|
+
rescue Zorki::ContentUnavailableError
|
34
34
|
count += 1
|
35
35
|
|
36
36
|
if count > 3
|
@@ -100,8 +100,7 @@ module Zorki
|
|
100
100
|
profile_image_url: profile_image_url
|
101
101
|
}
|
102
102
|
end
|
103
|
-
rescue Zorki::ContentUnavailableError
|
104
|
-
debugger
|
103
|
+
rescue Zorki::ContentUnavailableError
|
105
104
|
raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
|
106
105
|
end
|
107
106
|
end
|
data/lib/zorki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zorki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.27
|
4
|
+
version: 0.1.28
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christopher Guess
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-10-
|
11
|
+
date: 2024-10-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|