zorki 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/Gemfile.lock +3 -1
- data/lib/zorki/scrapers/post_scraper.rb +5 -3
- data/lib/zorki/scrapers/scraper.rb +15 -6
- data/lib/zorki/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e866c6873fc604f1720faeaf9425ff0557a6ef9947e9a9e4f657cc255a8d4665
|
4
|
+
data.tar.gz: e97fa8e84dd02edc9dd47beb9d66fe2b3cab4704297f16005dc09adf68eeb2e4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f5d6f36fede35c5d54289f6f7a41098e5374f74e8bbbf18ebd66486e4c049fdd064e063a7787000ed5997a9460496d9c5310795585a9098b1bfcbf891c7277e6
|
7
|
+
data.tar.gz: 8a79812eb8d312fef165868bbaad85cc4a7cd0f67ed04fb3e7cb0246e417ec5eefb89a54d3eec7ff78b3a3b38a9df89ae8c82423ea7fbb4ab04d160c86b397f5
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
zorki (0.1.
|
4
|
+
zorki (0.1.1)
|
5
5
|
apparition
|
6
6
|
capybara
|
7
7
|
oj
|
@@ -48,6 +48,7 @@ GEM
|
|
48
48
|
xpath (~> 3.2)
|
49
49
|
concurrent-ruby (1.2.2)
|
50
50
|
crass (1.0.6)
|
51
|
+
curb (1.0.5)
|
51
52
|
dotenv (2.7.6)
|
52
53
|
erubi (1.12.0)
|
53
54
|
ethon (0.16.0)
|
@@ -150,6 +151,7 @@ PLATFORMS
|
|
150
151
|
arm64-darwin-22
|
151
152
|
|
152
153
|
DEPENDENCIES
|
154
|
+
curb (~> 1.0, >= 1.0.5)
|
153
155
|
dotenv (~> 2.7.6)
|
154
156
|
minitest (~> 5.0)
|
155
157
|
rake (~> 13.0)
|
@@ -45,15 +45,17 @@ module Zorki
|
|
45
45
|
end.first["userInteractionCount"]
|
46
46
|
|
47
47
|
unless graphql_object["video"].empty?
|
48
|
-
|
49
|
-
|
48
|
+
video_url = graphql_object["video"].first["contentUrl"]
|
49
|
+
video = Zorki.retrieve_media(video_url)
|
50
|
+
|
51
|
+
video_preview_image_url = graphql_object["video"].first["thumbnailUrl"]
|
52
|
+
video_preview_image = Zorki.retrieve_media(video_preview_image_url)
|
50
53
|
end
|
51
54
|
else
|
52
55
|
# We need to see if this is a single image post or a slideshow. We do that
|
53
56
|
# by looking for a single image, if it's not there, we assume the alternative.
|
54
57
|
graphql_object = graphql_object["data"]["xdt_api__v1__media__shortcode__web_info"]
|
55
58
|
|
56
|
-
|
57
59
|
unless graphql_object["items"][0].has_key?("video_versions") && !graphql_object["items"][0]["video_versions"].nil?
|
58
60
|
# Check if there is a slideshow or not
|
59
61
|
unless graphql_object["items"][0].has_key?("carousel_media") && !graphql_object["items"][0]["carousel_media"].nil?
|
@@ -7,6 +7,7 @@ require "selenium-webdriver"
|
|
7
7
|
require "logger"
|
8
8
|
require "debug"
|
9
9
|
require "securerandom"
|
10
|
+
require "selenium/webdriver/remote/http/curb"
|
10
11
|
|
11
12
|
# 2022-06-07 14:15:23 WARN Selenium [DEPRECATION] [:browser_options] :options as a parameter for driver initialization is deprecated. Use :capabilities with an Array of value capabilities/options if necessary instead.
|
12
13
|
|
@@ -16,13 +17,14 @@ options.add_argument("--no-sandbox")
|
|
16
17
|
options.add_argument("--disable-dev-shm-usage")
|
17
18
|
options.add_argument("–-disable-blink-features=AutomationControlled")
|
18
19
|
options.add_argument("--disable-extensions")
|
20
|
+
options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
|
19
21
|
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
|
20
22
|
options.add_preference "password_manager_enabled", false
|
21
23
|
options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
|
22
24
|
|
23
25
|
Capybara.register_driver :selenium_zorki do |app|
|
24
|
-
client = Selenium::WebDriver::Remote::Http::
|
25
|
-
client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
|
26
|
+
client = Selenium::WebDriver::Remote::Http::Curb.new
|
27
|
+
# client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
|
26
28
|
Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
|
27
29
|
end
|
28
30
|
|
@@ -94,7 +96,7 @@ module Zorki
|
|
94
96
|
end
|
95
97
|
|
96
98
|
# Now that the intercept is set up, we visit the page we want
|
97
|
-
|
99
|
+
page.driver.browser.navigate.to(url)
|
98
100
|
# We wait until the correct intercept is processed or we've waited 60 seconds
|
99
101
|
start_time = Time.now
|
100
102
|
# puts "Waiting.... #{url}"
|
@@ -134,14 +136,16 @@ module Zorki
|
|
134
136
|
options.add_argument("--disable-dev-shm-usage")
|
135
137
|
options.add_argument("–-disable-blink-features=AutomationControlled")
|
136
138
|
options.add_argument("--disable-extensions")
|
139
|
+
options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
|
140
|
+
|
137
141
|
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
|
138
142
|
options.add_preference "password_manager_enabled", false
|
139
143
|
options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
|
140
144
|
# options.add_argument("--user-data-dir=/tmp/tarun")
|
141
145
|
|
142
146
|
Capybara.register_driver :selenium do |app|
|
143
|
-
client = Selenium::WebDriver::Remote::Http::
|
144
|
-
client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
|
147
|
+
client = Selenium::WebDriver::Remote::Http::Curb.new
|
148
|
+
# client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
|
145
149
|
Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
|
146
150
|
end
|
147
151
|
|
@@ -153,7 +157,12 @@ module Zorki
|
|
153
157
|
page.quit
|
154
158
|
|
155
159
|
# Check if we're on a Instagram page already, if not visit it.
|
156
|
-
|
160
|
+
unless page.driver.browser.current_url.include? "instagram.com"
|
161
|
+
# There seems to be a bug in the Linux ARM64 version of chromedriver where this will properly
|
162
|
+
# navigate but then timeout, crashing it all up. So instead we check and raise the error when
|
163
|
+
# that then fails again.
|
164
|
+
page.driver.browser.navigate.to("https://instagram.com")
|
165
|
+
end
|
157
166
|
|
158
167
|
# We don't have to login if we already are
|
159
168
|
begin
|
data/lib/zorki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zorki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christopher Guess
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-05-
|
11
|
+
date: 2023-05-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|