zorki 0.1.26 → 0.1.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/zorki/scrapers/scraper.rb +28 -9
- data/lib/zorki/scrapers/user_scraper.rb +7 -4
- data/lib/zorki/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c5472c0d436e13f2e8554b59051546fed9400ad793de71b9b2d546bb5bd02d08
|
4
|
+
data.tar.gz: d62650105cb0f41a48a93d4379e077a4c1b658e96ae13a30c1d8073f8f2e0546
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 84a98236f4ca36daf440a8aea29acec2fa6963508bae78f5ee7c4d92c2ffedf19ef8db4050deadaa5090ea770132d2a47c64a1bab87f52329bdf18dd31f4aa2e
|
7
|
+
data.tar.gz: e1b635b352163d08dc0ea9b5e74b3cb990a4f9a7d91ce29296ae2150692612c2a7a81fc9e04bfd33cedfd5c4dab7031e5f06a802a25032aed15036550f306328
|
data/Gemfile.lock
CHANGED
@@ -51,8 +51,13 @@ module Zorki
|
|
51
51
|
# additional_search_parameters is a comma-separated list of keys
|
52
52
|
# example: `data,xdt_api__v1__media__shortcode__web_info,items`
|
53
53
|
#
|
54
|
+
# NOTE: `post_data_include`, if not nil, overrides `additional_search_parameters`.
|
55
|
+
# This is so that I didn't have to refactor the entire code base when I added it.
|
56
|
+
# Eventually it might be better to look at the post request and see if we can do the
|
57
|
+
# same type of search there as we use for users and simplify this whole thing a lot.
|
58
|
+
#
|
54
59
|
# @returns Hash a ruby hash of the JSON data
|
55
|
-
def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
|
60
|
+
def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil, post_data_include: nil)
|
56
61
|
# So this is fun:
|
57
62
|
# For pages marked as misinformation we have to use one method (interception of request) and
|
58
63
|
# for pages that are not, we can just pull the data straight from the page.
|
@@ -65,21 +70,28 @@ module Zorki
|
|
65
70
|
# the one we want, and then moves on.
|
66
71
|
response_body = nil
|
67
72
|
|
73
|
+
responses = []
|
68
74
|
page.driver.browser.intercept do |request, &continue|
|
69
75
|
# This passes the request forward unmodified, since we only care about the response
|
70
|
-
#
|
76
|
+
#
|
77
|
+
# responses.first.post_data.include?("render_surface%22%3A%22PROFILE")
|
71
78
|
continue.call(request) && next unless request.url.include?(subpage_search)
|
79
|
+
continue.call(request) && next unless !post_data_include.nil? && request.post_data.include?(post_data_include)
|
72
80
|
|
73
81
|
continue.call(request) do |response|
|
82
|
+
puts "***********************************************************"
|
83
|
+
puts "checking request: #{request.url}"
|
84
|
+
puts response.body
|
85
|
+
puts "***********************************************************"
|
86
|
+
|
87
|
+
# responses << response
|
74
88
|
# Check if not a CORS prefetch and finish up if not
|
75
89
|
if !response.body&.empty? && response.body
|
76
90
|
check_passed = true
|
77
91
|
|
78
|
-
|
92
|
+
if !additional_search_parameters.nil? && post_data_include.nil?
|
79
93
|
body_to_check = Oj.load(response.body)
|
80
94
|
|
81
|
-
debugger if body_to_check.include?("jokoy.komi.io")
|
82
|
-
|
83
95
|
search_parameters = additional_search_parameters.split(",")
|
84
96
|
search_parameters.each_with_index do |key, index|
|
85
97
|
break if body_to_check.nil?
|
@@ -89,6 +101,13 @@ module Zorki
|
|
89
101
|
end
|
90
102
|
end
|
91
103
|
|
104
|
+
if check_passed == false
|
105
|
+
puts "***********************************************************"
|
106
|
+
puts "checking FAILED request: #{request.url}"
|
107
|
+
puts response.body
|
108
|
+
puts "***********************************************************"
|
109
|
+
end
|
110
|
+
|
92
111
|
response_body = response.body if check_passed == true
|
93
112
|
end
|
94
113
|
end
|
@@ -120,17 +139,17 @@ module Zorki
|
|
120
139
|
elements = doc.search("script").filter_map do |element|
|
121
140
|
parsed_element_json = nil
|
122
141
|
begin
|
123
|
-
element_json =
|
142
|
+
element_json = OJ.load(element.text)
|
124
143
|
|
125
|
-
if element.text.include?("jokoy.komi.io")
|
126
|
-
debugger
|
144
|
+
# if element.text.include?("jokoy.komi.io")
|
145
|
+
# debugger
|
127
146
|
# if element_json["require"].first.last.first["__bbox"].key?("require")
|
128
147
|
|
129
148
|
# element_json["require"].first.last.first["__bbox"]["require"].each do |x|
|
130
149
|
# debugger if x.to_s.include?("Si mulut pelaut")
|
131
150
|
# end
|
132
151
|
# end
|
133
|
-
end
|
152
|
+
# end
|
134
153
|
|
135
154
|
parsed_element_json = element_json["require"].last.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
|
136
155
|
rescue StandardError
|
@@ -20,13 +20,11 @@ module Zorki
|
|
20
20
|
graphql_script = nil
|
21
21
|
count = 0
|
22
22
|
loop do
|
23
|
-
raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username }) if count > 3
|
24
|
-
|
25
23
|
print "Scraping user #{username}... (attempt #{count + 1})\n"
|
26
24
|
begin
|
27
25
|
login
|
28
26
|
|
29
|
-
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql", "data,user,media_count")
|
27
|
+
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", "data,user,media_count", post_data_include: "render_surface")
|
30
28
|
graphql_script = graphql_script.first if graphql_script.class == Array
|
31
29
|
|
32
30
|
if graphql_script.nil?
|
@@ -34,8 +32,13 @@ module Zorki
|
|
34
32
|
end
|
35
33
|
rescue Zorki::ContentUnavailableError => e
|
36
34
|
count += 1
|
35
|
+
|
36
|
+
if count > 3
|
37
|
+
raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
|
38
|
+
end
|
39
|
+
|
37
40
|
page.driver.browser.navigate.to("https://www.instagram.com") # we want to go back to the main page so we start from scratch
|
38
|
-
sleep 10
|
41
|
+
sleep rand(5..10)
|
39
42
|
next
|
40
43
|
end
|
41
44
|
|
data/lib/zorki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zorki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.27
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christopher Guess
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-10-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|