zorki 0.1.27 → 0.1.29
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/zorki/scrapers/post_scraper.rb +4 -1
- data/lib/zorki/scrapers/scraper.rb +12 -10
- data/lib/zorki/scrapers/user_scraper.rb +5 -4
- data/lib/zorki/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 684dbf1fee604da4f2ab9687223bf3aea6f1ebad0a4f9e16ef7c80dafa645ac0
|
4
|
+
data.tar.gz: ab628433652062674b97318864663082cf5c7e623d32ff4ec87494d54df79788
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cfec123b6b4f1abb61c77c963b4162a9ef87273e3c3bc1dc7a67be34a5af5c38dbc70f4a43d3ae5690a3d5655a54d3081545352eb0768b926d598a575c1c3654
|
7
|
+
data.tar.gz: d7cc5599d995dcacc5d38dc60669f67f75d6005f282d9013ceb12bdb463b87ddf78eb9a156bf5e52ad0d3f464f1adea89abc2cc0ae371b4cbcbb981b72124593
|
@@ -40,6 +40,9 @@ module Zorki
|
|
40
40
|
Capybara.app_host = "https://instagram.com"
|
41
41
|
|
42
42
|
# video slideshows https://www.instagram.com/p/CY7KxwYOFBS/?utm_source=ig_embed&utm_campaign=loading
|
43
|
+
#
|
44
|
+
# TODO: Check if post is available publicly before trying to log in
|
45
|
+
# Should help with the scraping
|
43
46
|
login
|
44
47
|
graphql_object = get_content_of_subpage_from_url(
|
45
48
|
"https://www.instagram.com/p/#{id}/",
|
@@ -47,7 +50,6 @@ module Zorki
|
|
47
50
|
"data,xdt_api__v1__media__shortcode__web_info,items"
|
48
51
|
)
|
49
52
|
|
50
|
-
|
51
53
|
graphql_object = graphql_object.first if graphql_object.kind_of?(Array)
|
52
54
|
|
53
55
|
# For pages that have been marked misinfo the structure is very different than not
|
@@ -149,6 +151,7 @@ module Zorki
|
|
149
151
|
end
|
150
152
|
|
151
153
|
# Take the screenshot and return it
|
154
|
+
# rubocop:disable Lint/Debugger
|
152
155
|
save_screenshot("#{Zorki.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
|
153
156
|
end
|
154
157
|
end
|
@@ -70,13 +70,10 @@ module Zorki
|
|
70
70
|
# the one we want, and then moves on.
|
71
71
|
response_body = nil
|
72
72
|
|
73
|
-
responses = []
|
74
73
|
page.driver.browser.intercept do |request, &continue|
|
75
74
|
# This passes the request forward unmodified, since we only care about the response
|
76
|
-
#
|
77
|
-
# responses.first.post_data.include?("render_surface%22%3A%22PROFILE")
|
78
75
|
continue.call(request) && next unless request.url.include?(subpage_search)
|
79
|
-
continue.call(request) && next unless !post_data_include.nil? && request.post_data
|
76
|
+
continue.call(request) && next unless !post_data_include.nil? && request.post_data&.include?(post_data_include)
|
80
77
|
|
81
78
|
continue.call(request) do |response|
|
82
79
|
puts "***********************************************************"
|
@@ -113,6 +110,10 @@ module Zorki
|
|
113
110
|
end
|
114
111
|
rescue Selenium::WebDriver::Error::WebDriverError
|
115
112
|
# Eat them
|
113
|
+
rescue StandardError => e
|
114
|
+
puts "***********************************************************"
|
115
|
+
puts "Error in intercept: #{e}"
|
116
|
+
puts "***********************************************************"
|
116
117
|
end
|
117
118
|
|
118
119
|
# Now that the intercept is set up, we visit the page we want
|
@@ -131,6 +132,7 @@ module Zorki
|
|
131
132
|
# If this is a page that has not been marked as misinfo we can just pull the data
|
132
133
|
# TODO: put this before the whole load loop
|
133
134
|
if response_body.nil?
|
135
|
+
|
134
136
|
doc = Nokogiri::HTML(page.driver.browser.page_source)
|
135
137
|
# elements = doc.search("script").find_all do |e|
|
136
138
|
# e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
|
@@ -142,13 +144,13 @@ module Zorki
|
|
142
144
|
element_json = OJ.load(element.text)
|
143
145
|
|
144
146
|
# if element.text.include?("jokoy.komi.io")
|
145
|
-
|
146
|
-
|
147
|
+
# debugger
|
148
|
+
# if element_json["require"].first.last.first["__bbox"].key?("require")
|
147
149
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
150
|
+
# element_json["require"].first.last.first["__bbox"]["require"].each do |x|
|
151
|
+
# debugger if x.to_s.include?("Si mulut pelaut")
|
152
|
+
# end
|
153
|
+
# end
|
152
154
|
# end
|
153
155
|
|
154
156
|
parsed_element_json = element_json["require"].last.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
|
@@ -24,13 +24,15 @@ module Zorki
|
|
24
24
|
begin
|
25
25
|
login
|
26
26
|
|
27
|
-
|
27
|
+
# This is searching for a specific request, the reason it's weird is because it's uri encoded
|
28
|
+
# graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", "data,user,media_count", post_data_include: "render_surface%22%3A%22PROFILE")
|
29
|
+
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", nil, post_data_include: "render_surface%22%3A%22PROFILE")
|
28
30
|
graphql_script = graphql_script.first if graphql_script.class == Array
|
29
31
|
|
30
32
|
if graphql_script.nil?
|
31
33
|
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "web_profile_info")
|
32
34
|
end
|
33
|
-
rescue Zorki::ContentUnavailableError
|
35
|
+
rescue Zorki::ContentUnavailableError
|
34
36
|
count += 1
|
35
37
|
|
36
38
|
if count > 3
|
@@ -100,8 +102,7 @@ module Zorki
|
|
100
102
|
profile_image_url: profile_image_url
|
101
103
|
}
|
102
104
|
end
|
103
|
-
rescue Zorki::ContentUnavailableError
|
104
|
-
debugger
|
105
|
+
rescue Zorki::ContentUnavailableError
|
105
106
|
raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
|
106
107
|
end
|
107
108
|
end
|
data/lib/zorki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zorki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.27
|
4
|
+
version: 0.1.29
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christopher Guess
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-10-
|
11
|
+
date: 2024-10-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|