zorki 0.1.24 → 0.1.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +50 -46
- data/lib/zorki/scrapers/scraper.rb +30 -9
- data/lib/zorki/scrapers/user_scraper.rb +31 -6
- data/lib/zorki/version.rb +1 -1
- data/lib/zorki.rb +2 -2
- data/zorki.gemspec +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c5472c0d436e13f2e8554b59051546fed9400ad793de71b9b2d546bb5bd02d08
|
4
|
+
data.tar.gz: d62650105cb0f41a48a93d4379e077a4c1b658e96ae13a30c1d8073f8f2e0546
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 84a98236f4ca36daf440a8aea29acec2fa6963508bae78f5ee7c4d92c2ffedf19ef8db4050deadaa5090ea770132d2a47c64a1bab87f52329bdf18dd31f4aa2e
|
7
|
+
data.tar.gz: e1b635b352163d08dc0ea9b5e74b3cb990a4f9a7d91ce29296ae2150692612c2a7a81fc9e04bfd33cedfd5c4dab7031e5f06a802a25032aed15036550f306328
|
data/Gemfile.lock
CHANGED
@@ -1,35 +1,36 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
zorki (0.1.
|
4
|
+
zorki (0.1.26)
|
5
5
|
apparition
|
6
6
|
capybara
|
7
7
|
oj
|
8
8
|
selenium-devtools
|
9
|
-
selenium-webdriver (~> 4.
|
9
|
+
selenium-webdriver (~> 4.24.0)
|
10
10
|
typhoeus
|
11
11
|
|
12
12
|
GEM
|
13
13
|
remote: https://rubygems.org/
|
14
14
|
specs:
|
15
|
-
activesupport (7.1
|
15
|
+
activesupport (7.2.1)
|
16
16
|
base64
|
17
17
|
bigdecimal
|
18
|
-
concurrent-ruby (~> 1.0, >= 1.
|
18
|
+
concurrent-ruby (~> 1.0, >= 1.3.1)
|
19
19
|
connection_pool (>= 2.2.5)
|
20
20
|
drb
|
21
21
|
i18n (>= 1.6, < 2)
|
22
|
+
logger (>= 1.4.2)
|
22
23
|
minitest (>= 5.1)
|
23
|
-
|
24
|
-
tzinfo (~> 2.0)
|
25
|
-
addressable (2.8.
|
26
|
-
public_suffix (>= 2.0.2, <
|
24
|
+
securerandom (>= 0.3)
|
25
|
+
tzinfo (~> 2.0, >= 2.0.5)
|
26
|
+
addressable (2.8.7)
|
27
|
+
public_suffix (>= 2.0.2, < 7.0)
|
27
28
|
apparition (0.6.0)
|
28
29
|
capybara (~> 3.13, < 4)
|
29
30
|
websocket-driver (>= 0.6.5)
|
30
31
|
ast (2.4.2)
|
31
32
|
base64 (0.2.0)
|
32
|
-
bigdecimal (3.1.
|
33
|
+
bigdecimal (3.1.8)
|
33
34
|
capybara (3.40.0)
|
34
35
|
addressable
|
35
36
|
matrix
|
@@ -39,75 +40,76 @@ GEM
|
|
39
40
|
rack-test (>= 0.6.3)
|
40
41
|
regexp_parser (>= 1.5, < 3.0)
|
41
42
|
xpath (~> 3.2)
|
42
|
-
concurrent-ruby (1.
|
43
|
+
concurrent-ruby (1.3.4)
|
43
44
|
connection_pool (2.4.1)
|
44
|
-
curb (1.0.
|
45
|
-
debug (1.9.
|
45
|
+
curb (1.0.6)
|
46
|
+
debug (1.9.2)
|
46
47
|
irb (~> 1.10)
|
47
48
|
reline (>= 0.3.8)
|
48
49
|
dotenv (2.7.6)
|
49
50
|
drb (2.2.1)
|
50
51
|
ethon (0.16.0)
|
51
52
|
ffi (>= 1.15.0)
|
52
|
-
ffi (1.
|
53
|
-
i18n (1.14.
|
53
|
+
ffi (1.17.0-arm64-darwin)
|
54
|
+
i18n (1.14.5)
|
54
55
|
concurrent-ruby (~> 1.0)
|
55
56
|
io-console (0.7.2)
|
56
|
-
irb (1.
|
57
|
-
rdoc
|
57
|
+
irb (1.14.0)
|
58
|
+
rdoc (>= 4.0.0)
|
58
59
|
reline (>= 0.4.2)
|
59
|
-
json (2.7.
|
60
|
+
json (2.7.2)
|
60
61
|
language_server-protocol (3.17.0.3)
|
62
|
+
logger (1.6.1)
|
61
63
|
matrix (0.4.2)
|
62
64
|
mini_mime (1.1.5)
|
63
|
-
minitest (5.
|
64
|
-
|
65
|
-
nokogiri (1.16.3-arm64-darwin)
|
65
|
+
minitest (5.25.1)
|
66
|
+
nokogiri (1.16.7-arm64-darwin)
|
66
67
|
racc (~> 1.4)
|
67
|
-
oj (3.16.
|
68
|
+
oj (3.16.6)
|
68
69
|
bigdecimal (>= 3.0)
|
69
|
-
|
70
|
-
|
70
|
+
ostruct (>= 0.2)
|
71
|
+
ostruct (0.6.0)
|
72
|
+
parallel (1.26.3)
|
73
|
+
parser (3.3.5.0)
|
71
74
|
ast (~> 2.4.1)
|
72
75
|
racc
|
73
76
|
psych (5.1.2)
|
74
77
|
stringio
|
75
|
-
public_suffix (
|
76
|
-
racc (1.
|
78
|
+
public_suffix (6.0.1)
|
79
|
+
racc (1.8.1)
|
77
80
|
rack (3.0.8)
|
78
81
|
rack-test (2.1.0)
|
79
82
|
rack (>= 1.3)
|
80
83
|
rainbow (3.1.1)
|
81
|
-
rake (13.1
|
82
|
-
rdoc (6.
|
84
|
+
rake (13.2.1)
|
85
|
+
rdoc (6.7.0)
|
83
86
|
psych (>= 4.0.0)
|
84
|
-
regexp_parser (2.9.
|
85
|
-
reline (0.5.
|
87
|
+
regexp_parser (2.9.2)
|
88
|
+
reline (0.5.10)
|
86
89
|
io-console (~> 0.5)
|
87
|
-
rexml (3.
|
88
|
-
rubocop (1.
|
90
|
+
rexml (3.3.7)
|
91
|
+
rubocop (1.66.1)
|
89
92
|
json (~> 2.3)
|
90
93
|
language_server-protocol (>= 3.17.0)
|
91
94
|
parallel (~> 1.10)
|
92
95
|
parser (>= 3.3.0.2)
|
93
96
|
rainbow (>= 2.2.2, < 4.0)
|
94
|
-
regexp_parser (>=
|
95
|
-
|
96
|
-
rubocop-ast (>= 1.31.1, < 2.0)
|
97
|
+
regexp_parser (>= 2.4, < 3.0)
|
98
|
+
rubocop-ast (>= 1.32.2, < 2.0)
|
97
99
|
ruby-progressbar (~> 1.7)
|
98
100
|
unicode-display_width (>= 2.4.0, < 3.0)
|
99
|
-
rubocop-ast (1.
|
100
|
-
parser (>= 3.3.0
|
101
|
-
rubocop-md (1.2.
|
102
|
-
rubocop (>= 1.
|
103
|
-
rubocop-minitest (0.
|
101
|
+
rubocop-ast (1.32.3)
|
102
|
+
parser (>= 3.3.1.0)
|
103
|
+
rubocop-md (1.2.3)
|
104
|
+
rubocop (>= 1.45)
|
105
|
+
rubocop-minitest (0.36.0)
|
104
106
|
rubocop (>= 1.61, < 2.0)
|
105
107
|
rubocop-ast (>= 1.31.1, < 2.0)
|
106
108
|
rubocop-packaging (0.5.2)
|
107
109
|
rubocop (>= 1.33, < 2.0)
|
108
|
-
rubocop-performance (1.
|
110
|
+
rubocop-performance (1.21.1)
|
109
111
|
rubocop (>= 1.48.1, < 2.0)
|
110
|
-
rubocop-ast (>= 1.
|
112
|
+
rubocop-ast (>= 1.31.1, < 2.0)
|
111
113
|
rubocop-rails (2.19.1)
|
112
114
|
activesupport (>= 4.2.0)
|
113
115
|
rack (>= 1.1)
|
@@ -122,20 +124,22 @@ GEM
|
|
122
124
|
rubocop-rails (~> 2.0)
|
123
125
|
ruby-progressbar (1.13.0)
|
124
126
|
rubyzip (2.3.2)
|
125
|
-
|
127
|
+
securerandom (0.3.1)
|
128
|
+
selenium-devtools (0.128.0)
|
126
129
|
selenium-webdriver (~> 4.2)
|
127
|
-
selenium-webdriver (4.
|
130
|
+
selenium-webdriver (4.24.0)
|
128
131
|
base64 (~> 0.2)
|
132
|
+
logger (~> 1.4)
|
129
133
|
rexml (~> 3.2, >= 3.2.5)
|
130
134
|
rubyzip (>= 1.2.2, < 3.0)
|
131
135
|
websocket (~> 1.0)
|
132
|
-
stringio (3.1.
|
136
|
+
stringio (3.1.1)
|
133
137
|
typhoeus (1.4.1)
|
134
138
|
ethon (>= 0.9.0)
|
135
139
|
tzinfo (2.0.6)
|
136
140
|
concurrent-ruby (~> 1.0)
|
137
|
-
unicode-display_width (2.
|
138
|
-
websocket (1.2.
|
141
|
+
unicode-display_width (2.6.0)
|
142
|
+
websocket (1.2.11)
|
139
143
|
websocket-driver (0.7.6)
|
140
144
|
websocket-extensions (>= 0.1.0)
|
141
145
|
websocket-extensions (0.1.5)
|
data/lib/zorki/scrapers/scraper.rb
CHANGED
@@ -51,8 +51,13 @@ module Zorki
|
|
51
51
|
# additional_search_params is a comma seperated keys
|
52
52
|
# example: `data,xdt_api__v1__media__shortcode__web_info,items`
|
53
53
|
#
|
54
|
+
# NOTE: `post_data_include` if not nil overrules the additional_search_parameters
|
55
|
+
# This is so that i didn't have to refactor the entire code base when I added it.
|
56
|
+
# Eventually it might be better to look at the post request and see if we can do the
|
57
|
+
# same type of search there as we use for users and simplify this whole thing a lot.
|
58
|
+
#
|
54
59
|
# @returns Hash a ruby hash of the JSON data
|
55
|
-
def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
|
60
|
+
def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil, post_data_include: nil)
|
56
61
|
# So this is fun:
|
57
62
|
# For pages marked as misinformation we have to use one method (interception of requrest) and
|
58
63
|
# for pages that are not, we can just pull the data straight from the page.
|
@@ -65,21 +70,28 @@ module Zorki
|
|
65
70
|
# the one we want, and then moves on.
|
66
71
|
response_body = nil
|
67
72
|
|
73
|
+
responses = []
|
68
74
|
page.driver.browser.intercept do |request, &continue|
|
69
75
|
# This passes the request forward unmodified, since we only care about the response
|
70
|
-
#
|
76
|
+
#
|
77
|
+
# responses.first.post_data.include?("render_surface%22%3A%22PROFILE")
|
71
78
|
continue.call(request) && next unless request.url.include?(subpage_search)
|
79
|
+
continue.call(request) && next unless !post_data_include.nil? && request.post_data.include?(post_data_include)
|
72
80
|
|
73
81
|
continue.call(request) do |response|
|
82
|
+
puts "***********************************************************"
|
83
|
+
puts "checking request: #{request.url}"
|
84
|
+
puts response.body
|
85
|
+
puts "***********************************************************"
|
86
|
+
|
87
|
+
# responses << response
|
74
88
|
# Check if not a CORS prefetch and finish up if not
|
75
89
|
if !response.body&.empty? && response.body
|
76
90
|
check_passed = true
|
77
91
|
|
78
|
-
|
92
|
+
if !additional_search_parameters.nil? && post_data_include.nil?
|
79
93
|
body_to_check = Oj.load(response.body)
|
80
94
|
|
81
|
-
debugger if body_to_check.include?("jokoy.komi.io")
|
82
|
-
|
83
95
|
search_parameters = additional_search_parameters.split(",")
|
84
96
|
search_parameters.each_with_index do |key, index|
|
85
97
|
break if body_to_check.nil?
|
@@ -89,6 +101,13 @@ module Zorki
|
|
89
101
|
end
|
90
102
|
end
|
91
103
|
|
104
|
+
if check_passed == false
|
105
|
+
puts "***********************************************************"
|
106
|
+
puts "checking FAILED request: #{request.url}"
|
107
|
+
puts response.body
|
108
|
+
puts "***********************************************************"
|
109
|
+
end
|
110
|
+
|
92
111
|
response_body = response.body if check_passed == true
|
93
112
|
end
|
94
113
|
end
|
@@ -120,17 +139,17 @@ module Zorki
|
|
120
139
|
elements = doc.search("script").filter_map do |element|
|
121
140
|
parsed_element_json = nil
|
122
141
|
begin
|
123
|
-
element_json =
|
142
|
+
element_json = OJ.load(element.text)
|
124
143
|
|
125
|
-
if element.text.include?("jokoy.komi.io")
|
126
|
-
debugger
|
144
|
+
# if element.text.include?("jokoy.komi.io")
|
145
|
+
# debugger
|
127
146
|
# if element_json["require"].first.last.first["__bbox"].key?("require")
|
128
147
|
|
129
148
|
# element_json["require"].first.last.first["__bbox"]["require"].each do |x|
|
130
149
|
# debugger if x.to_s.include?("Si mulut pelaut")
|
131
150
|
# end
|
132
151
|
# end
|
133
|
-
end
|
152
|
+
# end
|
134
153
|
|
135
154
|
parsed_element_json = element_json["require"].last.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
|
136
155
|
rescue StandardError
|
@@ -141,12 +160,14 @@ module Zorki
|
|
141
160
|
end
|
142
161
|
|
143
162
|
if elements&.empty?
|
163
|
+
# debugger
|
144
164
|
raise ContentUnavailableError.new("Cannot find anything", additional_data: { page_source: page.driver.browser.page_source, elements: elements })
|
145
165
|
end
|
146
166
|
|
147
167
|
return elements
|
148
168
|
end
|
149
169
|
|
170
|
+
# debugger if response_body.nil?
|
150
171
|
raise ContentUnavailableError.new("Response body nil") if response_body.nil?
|
151
172
|
Oj.load(response_body)
|
152
173
|
ensure
|
data/lib/zorki/scrapers/user_scraper.rb
CHANGED
@@ -16,13 +16,33 @@ module Zorki
|
|
16
16
|
# - *description
|
17
17
|
# - *links
|
18
18
|
# - *Profile image
|
19
|
-
login
|
20
19
|
|
21
|
-
graphql_script =
|
22
|
-
|
20
|
+
graphql_script = nil
|
21
|
+
count = 0
|
22
|
+
loop do
|
23
|
+
print "Scraping user #{username}... (attempt #{count + 1})\n"
|
24
|
+
begin
|
25
|
+
login
|
26
|
+
|
27
|
+
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", "data,user,media_count", post_data_include: "render_surface")
|
28
|
+
graphql_script = graphql_script.first if graphql_script.class == Array
|
29
|
+
|
30
|
+
if graphql_script.nil?
|
31
|
+
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "web_profile_info")
|
32
|
+
end
|
33
|
+
rescue Zorki::ContentUnavailableError => e
|
34
|
+
count += 1
|
35
|
+
|
36
|
+
if count > 3
|
37
|
+
raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
|
38
|
+
end
|
23
39
|
|
24
|
-
|
25
|
-
|
40
|
+
page.driver.browser.navigate.to("https://www.instagram.com") # we want to go back to the main page so we start from scratch
|
41
|
+
sleep rand(5..10)
|
42
|
+
next
|
43
|
+
end
|
44
|
+
|
45
|
+
break
|
26
46
|
end
|
27
47
|
|
28
48
|
if graphql_script.has_key?("author") && !graphql_script["author"].nil?
|
@@ -36,10 +56,14 @@ module Zorki
|
|
36
56
|
["https://schema.org/FilmAction", "http://schema.org/WriteAction"].include?(stat["interactionType"])
|
37
57
|
end.first
|
38
58
|
|
59
|
+
# number_of_posts = graphql_script["data"]["user"]["media_count"] if number_of_posts.nil?
|
60
|
+
|
39
61
|
number_of_followers = graphql_script["interactionStatistic"].select do |stat|
|
40
62
|
stat["interactionType"] == "http://schema.org/FollowAction"
|
41
63
|
end.first
|
42
64
|
|
65
|
+
# number_of_followers = graphql_script["data"]["user"]["follower_count"] if number_of_followers.nil?
|
66
|
+
|
43
67
|
begin
|
44
68
|
profile_image_url = user["image"]
|
45
69
|
{
|
@@ -76,7 +100,8 @@ module Zorki
|
|
76
100
|
profile_image_url: profile_image_url
|
77
101
|
}
|
78
102
|
end
|
79
|
-
rescue Zorki::ContentUnavailableError
|
103
|
+
rescue Zorki::ContentUnavailableError => e
|
104
|
+
debugger
|
80
105
|
raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
|
81
106
|
end
|
82
107
|
end
|
data/lib/zorki/version.rb
CHANGED
data/lib/zorki.rb
CHANGED
data/zorki.gemspec
CHANGED
@@ -33,7 +33,7 @@ Gem::Specification.new do |spec|
|
|
33
33
|
spec.add_dependency "apparition" # A Chrome driver for Capybara
|
34
34
|
spec.add_dependency "typhoeus" # For making API requests
|
35
35
|
spec.add_dependency "oj" # A faster JSON parser/loader than stdlib
|
36
|
-
spec.add_dependency "selenium-webdriver", "~> 4.
|
36
|
+
spec.add_dependency "selenium-webdriver", "~> 4.24.0" # Webdriver selenium
|
37
37
|
spec.add_dependency "selenium-devtools" # Allow us to intercept requests
|
38
38
|
|
39
39
|
spec.add_development_dependency "debug"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zorki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.27
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christopher Guess
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-10-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|
@@ -72,14 +72,14 @@ dependencies:
|
|
72
72
|
requirements:
|
73
73
|
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version:
|
75
|
+
version: 4.24.0
|
76
76
|
type: :runtime
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version:
|
82
|
+
version: 4.24.0
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: selenium-devtools
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|