zorki 0.1.27 → 0.1.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/zorki/scrapers/post_scraper.rb +4 -1
- data/lib/zorki/scrapers/scraper.rb +12 -10
- data/lib/zorki/scrapers/user_scraper.rb +5 -4
- data/lib/zorki/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 684dbf1fee604da4f2ab9687223bf3aea6f1ebad0a4f9e16ef7c80dafa645ac0
|
4
|
+
data.tar.gz: ab628433652062674b97318864663082cf5c7e623d32ff4ec87494d54df79788
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cfec123b6b4f1abb61c77c963b4162a9ef87273e3c3bc1dc7a67be34a5af5c38dbc70f4a43d3ae5690a3d5655a54d3081545352eb0768b926d598a575c1c3654
|
7
|
+
data.tar.gz: d7cc5599d995dcacc5d38dc60669f67f75d6005f282d9013ceb12bdb463b87ddf78eb9a156bf5e52ad0d3f464f1adea89abc2cc0ae371b4cbcbb981b72124593
|
@@ -40,6 +40,9 @@ module Zorki
|
|
40
40
|
Capybara.app_host = "https://instagram.com"
|
41
41
|
|
42
42
|
# video slideshows https://www.instagram.com/p/CY7KxwYOFBS/?utm_source=ig_embed&utm_campaign=loading
|
43
|
+
#
|
44
|
+
# TODO: Check if post is available publicly before trying to log in
|
45
|
+
# Should help with the scraping
|
43
46
|
login
|
44
47
|
graphql_object = get_content_of_subpage_from_url(
|
45
48
|
"https://www.instagram.com/p/#{id}/",
|
@@ -47,7 +50,6 @@ module Zorki
|
|
47
50
|
"data,xdt_api__v1__media__shortcode__web_info,items"
|
48
51
|
)
|
49
52
|
|
50
|
-
|
51
53
|
graphql_object = graphql_object.first if graphql_object.kind_of?(Array)
|
52
54
|
|
53
55
|
# For pages that have been marked as misinfo, the structure is very different from pages that have not
|
@@ -149,6 +151,7 @@ module Zorki
|
|
149
151
|
end
|
150
152
|
|
151
153
|
# Take the screenshot and return it
|
154
|
+
# rubocop:disable Lint/Debugger
|
152
155
|
save_screenshot("#{Zorki.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
|
153
156
|
end
|
154
157
|
end
|
@@ -70,13 +70,10 @@ module Zorki
|
|
70
70
|
# the one we want, and then moves on.
|
71
71
|
response_body = nil
|
72
72
|
|
73
|
-
responses = []
|
74
73
|
page.driver.browser.intercept do |request, &continue|
|
75
74
|
# This passes the request forward unmodified, since we only care about the response
|
76
|
-
#
|
77
|
-
# responses.first.post_data.include?("render_surface%22%3A%22PROFILE")
|
78
75
|
continue.call(request) && next unless request.url.include?(subpage_search)
|
79
|
-
continue.call(request) && next unless !post_data_include.nil? && request.post_data
|
76
|
+
continue.call(request) && next unless !post_data_include.nil? && request.post_data&.include?(post_data_include)
|
80
77
|
|
81
78
|
continue.call(request) do |response|
|
82
79
|
puts "***********************************************************"
|
@@ -113,6 +110,10 @@ module Zorki
|
|
113
110
|
end
|
114
111
|
rescue Selenium::WebDriver::Error::WebDriverError
|
115
112
|
# Eat them
|
113
|
+
rescue StandardError => e
|
114
|
+
puts "***********************************************************"
|
115
|
+
puts "Error in intercept: #{e}"
|
116
|
+
puts "***********************************************************"
|
116
117
|
end
|
117
118
|
|
118
119
|
# Now that the intercept is set up, we visit the page we want
|
@@ -131,6 +132,7 @@ module Zorki
|
|
131
132
|
# If this is a page that has not been marked as misinfo we can just pull the data
|
132
133
|
# TODO: put this before the whole load loop
|
133
134
|
if response_body.nil?
|
135
|
+
|
134
136
|
doc = Nokogiri::HTML(page.driver.browser.page_source)
|
135
137
|
# elements = doc.search("script").find_all do |e|
|
136
138
|
# e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
|
@@ -142,13 +144,13 @@ module Zorki
|
|
142
144
|
element_json = OJ.load(element.text)
|
143
145
|
|
144
146
|
# if element.text.include?("jokoy.komi.io")
|
145
|
-
|
146
|
-
|
147
|
+
# debugger
|
148
|
+
# if element_json["require"].first.last.first["__bbox"].key?("require")
|
147
149
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
150
|
+
# element_json["require"].first.last.first["__bbox"]["require"].each do |x|
|
151
|
+
# debugger if x.to_s.include?("Si mulut pelaut")
|
152
|
+
# end
|
153
|
+
# end
|
152
154
|
# end
|
153
155
|
|
154
156
|
parsed_element_json = element_json["require"].last.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
|
@@ -24,13 +24,15 @@ module Zorki
|
|
24
24
|
begin
|
25
25
|
login
|
26
26
|
|
27
|
-
|
27
|
+
# This searches for a specific request; the search string looks odd because it is URI-encoded
|
28
|
+
# graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", "data,user,media_count", post_data_include: "render_surface%22%3A%22PROFILE")
|
29
|
+
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", nil, post_data_include: "render_surface%22%3A%22PROFILE")
|
28
30
|
graphql_script = graphql_script.first if graphql_script.class == Array
|
29
31
|
|
30
32
|
if graphql_script.nil?
|
31
33
|
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "web_profile_info")
|
32
34
|
end
|
33
|
-
rescue Zorki::ContentUnavailableError
|
35
|
+
rescue Zorki::ContentUnavailableError
|
34
36
|
count += 1
|
35
37
|
|
36
38
|
if count > 3
|
@@ -100,8 +102,7 @@ module Zorki
|
|
100
102
|
profile_image_url: profile_image_url
|
101
103
|
}
|
102
104
|
end
|
103
|
-
rescue Zorki::ContentUnavailableError
|
104
|
-
debugger
|
105
|
+
rescue Zorki::ContentUnavailableError
|
105
106
|
raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
|
106
107
|
end
|
107
108
|
end
|
data/lib/zorki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zorki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.27
|
4
|
+
version: 0.1.29
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christopher Guess
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-10-
|
11
|
+
date: 2024-10-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|