zorki 0.1.1 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/Gemfile.lock +31 -27
- data/lib/zorki/scrapers/post_scraper.rb +5 -3
- data/lib/zorki/scrapers/scraper.rb +15 -6
- data/lib/zorki/scrapers/user_scraper.rb +17 -14
- data/lib/zorki/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8e9aca1027df18f607fac0616a75a4b8ac11728cbac5de9205f014c57306f82e
|
4
|
+
data.tar.gz: c22375fd87090060642780ae2ec472d505f66c7ed1579d35d8eb0e9fe02bd976
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b3e3117ca0903a23fd2303ec938d70d1d8dd8f82eeb510019832e25e286a4c5ae665c707cc62523dedf9617de9239072aa02d8877fb8e2fae2ea4f85ad14bbc9
|
7
|
+
data.tar.gz: 7a2b9d079041484f5553bf5be6fa391bbeb20c8660fb07df69cd7b6feb28d8f7d315e22353d89ff1ccedf7304e39dfbab9f8f18e9f25ad44cd7067e473d515d1
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
zorki (0.1.
|
4
|
+
zorki (0.1.2)
|
5
5
|
apparition
|
6
6
|
capybara
|
7
7
|
oj
|
@@ -12,20 +12,20 @@ PATH
|
|
12
12
|
GEM
|
13
13
|
remote: https://rubygems.org/
|
14
14
|
specs:
|
15
|
-
actionpack (7.0.
|
16
|
-
actionview (= 7.0.
|
17
|
-
activesupport (= 7.0.
|
18
|
-
rack (~> 2.0, >= 2.2.
|
15
|
+
actionpack (7.0.5)
|
16
|
+
actionview (= 7.0.5)
|
17
|
+
activesupport (= 7.0.5)
|
18
|
+
rack (~> 2.0, >= 2.2.4)
|
19
19
|
rack-test (>= 0.6.3)
|
20
20
|
rails-dom-testing (~> 2.0)
|
21
21
|
rails-html-sanitizer (~> 1.0, >= 1.2.0)
|
22
|
-
actionview (7.0.
|
23
|
-
activesupport (= 7.0.
|
22
|
+
actionview (7.0.5)
|
23
|
+
activesupport (= 7.0.5)
|
24
24
|
builder (~> 3.1)
|
25
25
|
erubi (~> 1.4)
|
26
26
|
rails-dom-testing (~> 2.0)
|
27
27
|
rails-html-sanitizer (~> 1.1, >= 1.2.0)
|
28
|
-
activesupport (7.0.
|
28
|
+
activesupport (7.0.5)
|
29
29
|
concurrent-ruby (~> 1.0, >= 1.0.2)
|
30
30
|
i18n (>= 1.6, < 2)
|
31
31
|
minitest (>= 5.1)
|
@@ -37,7 +37,7 @@ GEM
|
|
37
37
|
websocket-driver (>= 0.6.5)
|
38
38
|
ast (2.4.2)
|
39
39
|
builder (3.2.4)
|
40
|
-
capybara (3.39.
|
40
|
+
capybara (3.39.2)
|
41
41
|
addressable
|
42
42
|
matrix
|
43
43
|
mini_mime (>= 0.1.3)
|
@@ -48,59 +48,62 @@ GEM
|
|
48
48
|
xpath (~> 3.2)
|
49
49
|
concurrent-ruby (1.2.2)
|
50
50
|
crass (1.0.6)
|
51
|
+
curb (1.0.5)
|
51
52
|
dotenv (2.7.6)
|
52
53
|
erubi (1.12.0)
|
53
54
|
ethon (0.16.0)
|
54
55
|
ffi (>= 1.15.0)
|
55
56
|
ffi (1.15.5)
|
56
|
-
i18n (1.
|
57
|
+
i18n (1.14.1)
|
57
58
|
concurrent-ruby (~> 1.0)
|
58
59
|
json (2.6.3)
|
59
|
-
loofah (2.21.
|
60
|
+
loofah (2.21.3)
|
60
61
|
crass (~> 1.0.2)
|
61
62
|
nokogiri (>= 1.12.0)
|
62
63
|
matrix (0.4.2)
|
63
64
|
method_source (1.0.0)
|
64
65
|
mini_mime (1.1.2)
|
65
66
|
minitest (5.18.0)
|
66
|
-
nokogiri (1.
|
67
|
+
nokogiri (1.15.2-arm64-darwin)
|
67
68
|
racc (~> 1.4)
|
68
|
-
oj (3.
|
69
|
+
oj (3.15.0)
|
69
70
|
parallel (1.23.0)
|
70
|
-
parser (3.2.2.
|
71
|
+
parser (3.2.2.3)
|
71
72
|
ast (~> 2.4.1)
|
73
|
+
racc
|
72
74
|
public_suffix (5.0.1)
|
73
|
-
racc (1.
|
75
|
+
racc (1.7.0)
|
74
76
|
rack (2.2.7)
|
75
77
|
rack-test (2.1.0)
|
76
78
|
rack (>= 1.3)
|
77
79
|
rails-dom-testing (2.0.3)
|
78
80
|
activesupport (>= 4.2.0)
|
79
81
|
nokogiri (>= 1.6)
|
80
|
-
rails-html-sanitizer (1.
|
81
|
-
loofah (~> 2.
|
82
|
-
|
83
|
-
|
84
|
-
|
82
|
+
rails-html-sanitizer (1.6.0)
|
83
|
+
loofah (~> 2.21)
|
84
|
+
nokogiri (~> 1.14)
|
85
|
+
railties (7.0.5)
|
86
|
+
actionpack (= 7.0.5)
|
87
|
+
activesupport (= 7.0.5)
|
85
88
|
method_source
|
86
89
|
rake (>= 12.2)
|
87
90
|
thor (~> 1.0)
|
88
91
|
zeitwerk (~> 2.5)
|
89
92
|
rainbow (3.1.1)
|
90
93
|
rake (13.0.6)
|
91
|
-
regexp_parser (2.8.
|
94
|
+
regexp_parser (2.8.1)
|
92
95
|
rexml (3.2.5)
|
93
|
-
rubocop (1.
|
96
|
+
rubocop (1.52.1)
|
94
97
|
json (~> 2.3)
|
95
98
|
parallel (~> 1.10)
|
96
|
-
parser (>= 3.2.
|
99
|
+
parser (>= 3.2.2.3)
|
97
100
|
rainbow (>= 2.2.2, < 4.0)
|
98
101
|
regexp_parser (>= 1.8, < 3.0)
|
99
102
|
rexml (>= 3.2.5, < 4.0)
|
100
103
|
rubocop-ast (>= 1.28.0, < 2.0)
|
101
104
|
ruby-progressbar (~> 1.7)
|
102
105
|
unicode-display_width (>= 2.4.0, < 3.0)
|
103
|
-
rubocop-ast (1.
|
106
|
+
rubocop-ast (1.29.0)
|
104
107
|
parser (>= 3.2.1.0)
|
105
108
|
rubocop-md (1.2.0)
|
106
109
|
rubocop (>= 1.0)
|
@@ -108,7 +111,7 @@ GEM
|
|
108
111
|
rubocop (>= 1.39, < 2.0)
|
109
112
|
rubocop-packaging (0.5.2)
|
110
113
|
rubocop (>= 1.33, < 2.0)
|
111
|
-
rubocop-performance (1.
|
114
|
+
rubocop-performance (1.18.0)
|
112
115
|
rubocop (>= 1.7.0, < 2.0)
|
113
116
|
rubocop-ast (>= 0.4.0)
|
114
117
|
rubocop-rails (2.19.1)
|
@@ -126,9 +129,9 @@ GEM
|
|
126
129
|
rubocop-rails (~> 2.0)
|
127
130
|
ruby-progressbar (1.13.0)
|
128
131
|
rubyzip (2.3.2)
|
129
|
-
selenium-devtools (0.
|
132
|
+
selenium-devtools (0.114.0)
|
130
133
|
selenium-webdriver (~> 4.2)
|
131
|
-
selenium-webdriver (4.
|
134
|
+
selenium-webdriver (4.10.0)
|
132
135
|
rexml (~> 3.2, >= 3.2.5)
|
133
136
|
rubyzip (>= 1.2.2, < 3.0)
|
134
137
|
websocket (~> 1.0)
|
@@ -150,6 +153,7 @@ PLATFORMS
|
|
150
153
|
arm64-darwin-22
|
151
154
|
|
152
155
|
DEPENDENCIES
|
156
|
+
curb (~> 1.0, >= 1.0.5)
|
153
157
|
dotenv (~> 2.7.6)
|
154
158
|
minitest (~> 5.0)
|
155
159
|
rake (~> 13.0)
|
@@ -45,15 +45,17 @@ module Zorki
|
|
45
45
|
end.first["userInteractionCount"]
|
46
46
|
|
47
47
|
unless graphql_object["video"].empty?
|
48
|
-
|
49
|
-
|
48
|
+
video_url = graphql_object["video"].first["contentUrl"]
|
49
|
+
video = Zorki.retrieve_media(video_url)
|
50
|
+
|
51
|
+
video_preview_image_url = graphql_object["video"].first["thumbnailUrl"]
|
52
|
+
video_preview_image = Zorki.retrieve_media(video_preview_image_url)
|
50
53
|
end
|
51
54
|
else
|
52
55
|
# We need to see if this is a single image post or a slideshow. We do that
|
53
56
|
# by looking for a single image, if it's not there, we assume the alternative.
|
54
57
|
graphql_object = graphql_object["data"]["xdt_api__v1__media__shortcode__web_info"]
|
55
58
|
|
56
|
-
|
57
59
|
unless graphql_object["items"][0].has_key?("video_versions") && !graphql_object["items"][0]["video_versions"].nil?
|
58
60
|
# Check if there is a slideshow or not
|
59
61
|
unless graphql_object["items"][0].has_key?("carousel_media") && !graphql_object["items"][0]["carousel_media"].nil?
|
@@ -7,6 +7,7 @@ require "selenium-webdriver"
|
|
7
7
|
require "logger"
|
8
8
|
require "debug"
|
9
9
|
require "securerandom"
|
10
|
+
require "selenium/webdriver/remote/http/curb"
|
10
11
|
|
11
12
|
# 2022-06-07 14:15:23 WARN Selenium [DEPRECATION] [:browser_options] :options as a parameter for driver initialization is deprecated. Use :capabilities with an Array of value capabilities/options if necessary instead.
|
12
13
|
|
@@ -16,13 +17,14 @@ options.add_argument("--no-sandbox")
|
|
16
17
|
options.add_argument("--disable-dev-shm-usage")
|
17
18
|
options.add_argument("--disable-blink-features=AutomationControlled")
|
18
19
|
options.add_argument("--disable-extensions")
|
20
|
+
options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
|
19
21
|
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
|
20
22
|
options.add_preference "password_manager_enabled", false
|
21
23
|
options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
|
22
24
|
|
23
25
|
Capybara.register_driver :selenium_zorki do |app|
|
24
|
-
client = Selenium::WebDriver::Remote::Http::
|
25
|
-
client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
|
26
|
+
client = Selenium::WebDriver::Remote::Http::Curb.new
|
27
|
+
# client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
|
26
28
|
Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
|
27
29
|
end
|
28
30
|
|
@@ -94,7 +96,7 @@ module Zorki
|
|
94
96
|
end
|
95
97
|
|
96
98
|
# Now that the intercept is set up, we visit the page we want
|
97
|
-
|
99
|
+
page.driver.browser.navigate.to(url)
|
98
100
|
# We wait until the correct intercept is processed or we've waited 60 seconds
|
99
101
|
start_time = Time.now
|
100
102
|
# puts "Waiting.... #{url}"
|
@@ -134,14 +136,16 @@ module Zorki
|
|
134
136
|
options.add_argument("--disable-dev-shm-usage")
|
135
137
|
options.add_argument("--disable-blink-features=AutomationControlled")
|
136
138
|
options.add_argument("--disable-extensions")
|
139
|
+
options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
|
140
|
+
|
137
141
|
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
|
138
142
|
options.add_preference "password_manager_enabled", false
|
139
143
|
options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
|
140
144
|
# options.add_argument("--user-data-dir=/tmp/tarun")
|
141
145
|
|
142
146
|
Capybara.register_driver :selenium do |app|
|
143
|
-
client = Selenium::WebDriver::Remote::Http::
|
144
|
-
client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
|
147
|
+
client = Selenium::WebDriver::Remote::Http::Curb.new
|
148
|
+
# client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
|
145
149
|
Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
|
146
150
|
end
|
147
151
|
|
@@ -153,7 +157,12 @@ module Zorki
|
|
153
157
|
page.quit
|
154
158
|
|
155
159
|
# Check if we're on an Instagram page already; if not, visit it.
|
156
|
-
|
160
|
+
unless page.driver.browser.current_url.include? "instagram.com"
|
161
|
+
# There seems to be a bug in the Linux ARM64 version of chromedriver where this will properly
|
162
|
+
# navigate but then time out, crashing everything. So instead we check and raise the error when
|
163
|
+
# that then fails again.
|
164
|
+
page.driver.browser.navigate.to("https://instagram.com")
|
165
|
+
end
|
157
166
|
|
158
167
|
# We don't have to login if we already are
|
159
168
|
begin
|
@@ -19,6 +19,7 @@ module Zorki
|
|
19
19
|
login
|
20
20
|
|
21
21
|
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "?username=")
|
22
|
+
graphql_script = graphql_script.first if graphql_script.class == Array
|
22
23
|
|
23
24
|
if graphql_script.has_key?("author") && !graphql_script["author"].nil?
|
24
25
|
user = graphql_script["author"]
|
@@ -28,26 +29,28 @@ module Zorki
|
|
28
29
|
raise Zorki::Error unless username == scraped_username
|
29
30
|
|
30
31
|
number_of_posts = graphql_script["interactionStatistic"].select do |stat|
|
31
|
-
|
32
|
+
["https://schema.org/FilmAction", "http://schema.org/WriteAction"].include?(stat["interactionType"])
|
32
33
|
end.first
|
33
34
|
|
34
35
|
number_of_followers = graphql_script["interactionStatistic"].select do |stat|
|
35
36
|
stat["interactionType"] == "http://schema.org/FollowAction"
|
36
37
|
end.first
|
37
38
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
39
|
+
begin
|
40
|
+
profile_image_url = user["image"]
|
41
|
+
{
|
42
|
+
name: user["name"],
|
43
|
+
username: username,
|
44
|
+
number_of_posts: Integer(number_of_posts["userInteractionCount"]),
|
45
|
+
number_of_followers: Integer(number_of_followers["userInteractionCount"]),
|
46
|
+
# number_of_following: user["edge_follow"]["count"],
|
47
|
+
verified: user["is_verified"], # todo
|
48
|
+
profile: graphql_script["description"],
|
49
|
+
profile_link: user["sameAs"],
|
50
|
+
profile_image: Zorki.retrieve_media(profile_image_url),
|
51
|
+
profile_image_url: profile_image_url
|
52
|
+
}
|
53
|
+
end
|
51
54
|
else
|
52
55
|
user = graphql_script["data"]["user"]
|
53
56
|
|
data/lib/zorki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zorki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christopher Guess
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-06-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|