zorki 0.1.1 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 384abb984483d4f38da44e77b7cd359e7d3aadea7c96765bc5b2837c3e79e99d
4
- data.tar.gz: 48bf93b460b4c947cc9bf47f97dc469de124686cb763eccfa8ef7ef00e11f2b6
3
+ metadata.gz: 8e9aca1027df18f607fac0616a75a4b8ac11728cbac5de9205f014c57306f82e
4
+ data.tar.gz: c22375fd87090060642780ae2ec472d505f66c7ed1579d35d8eb0e9fe02bd976
5
5
  SHA512:
6
- metadata.gz: e70b44d624a51df5a7ca6d9b6078cf3b44c8da1c07291aee60e4d554e2bc5705cfab3cea112ccbc79f6354d1092dca90330b7697f4e19606034857de361a08e3
7
- data.tar.gz: b819d82e750f72cb55ab34b2a7b8332c70828220857501c22315480e644c2b88d5732f3b4074d7d240015a70dadace213b4a1e388145a6d50d544ef36b20b655
6
+ metadata.gz: b3e3117ca0903a23fd2303ec938d70d1d8dd8f82eeb510019832e25e286a4c5ae665c707cc62523dedf9617de9239072aa02d8877fb8e2fae2ea4f85ad14bbc9
7
+ data.tar.gz: 7a2b9d079041484f5553bf5be6fa391bbeb20c8660fb07df69cd7b6feb28d8f7d315e22353d89ff1ccedf7304e39dfbab9f8f18e9f25ad44cd7067e473d515d1
data/Gemfile CHANGED
@@ -15,3 +15,5 @@ gem "rubocop-rails", "~> 2.19.1", require: false # Rails specific styles
15
15
  gem "rubocop-rails_config"
16
16
 
17
17
  gem "dotenv", "~> 2.7.6"
18
+
19
+ gem "curb", "~> 1.0", ">= 1.0.5"
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- zorki (0.1.0)
4
+ zorki (0.1.2)
5
5
  apparition
6
6
  capybara
7
7
  oj
@@ -12,20 +12,20 @@ PATH
12
12
  GEM
13
13
  remote: https://rubygems.org/
14
14
  specs:
15
- actionpack (7.0.4.3)
16
- actionview (= 7.0.4.3)
17
- activesupport (= 7.0.4.3)
18
- rack (~> 2.0, >= 2.2.0)
15
+ actionpack (7.0.5)
16
+ actionview (= 7.0.5)
17
+ activesupport (= 7.0.5)
18
+ rack (~> 2.0, >= 2.2.4)
19
19
  rack-test (>= 0.6.3)
20
20
  rails-dom-testing (~> 2.0)
21
21
  rails-html-sanitizer (~> 1.0, >= 1.2.0)
22
- actionview (7.0.4.3)
23
- activesupport (= 7.0.4.3)
22
+ actionview (7.0.5)
23
+ activesupport (= 7.0.5)
24
24
  builder (~> 3.1)
25
25
  erubi (~> 1.4)
26
26
  rails-dom-testing (~> 2.0)
27
27
  rails-html-sanitizer (~> 1.1, >= 1.2.0)
28
- activesupport (7.0.4.3)
28
+ activesupport (7.0.5)
29
29
  concurrent-ruby (~> 1.0, >= 1.0.2)
30
30
  i18n (>= 1.6, < 2)
31
31
  minitest (>= 5.1)
@@ -37,7 +37,7 @@ GEM
37
37
  websocket-driver (>= 0.6.5)
38
38
  ast (2.4.2)
39
39
  builder (3.2.4)
40
- capybara (3.39.1)
40
+ capybara (3.39.2)
41
41
  addressable
42
42
  matrix
43
43
  mini_mime (>= 0.1.3)
@@ -48,59 +48,62 @@ GEM
48
48
  xpath (~> 3.2)
49
49
  concurrent-ruby (1.2.2)
50
50
  crass (1.0.6)
51
+ curb (1.0.5)
51
52
  dotenv (2.7.6)
52
53
  erubi (1.12.0)
53
54
  ethon (0.16.0)
54
55
  ffi (>= 1.15.0)
55
56
  ffi (1.15.5)
56
- i18n (1.13.0)
57
+ i18n (1.14.1)
57
58
  concurrent-ruby (~> 1.0)
58
59
  json (2.6.3)
59
- loofah (2.21.2)
60
+ loofah (2.21.3)
60
61
  crass (~> 1.0.2)
61
62
  nokogiri (>= 1.12.0)
62
63
  matrix (0.4.2)
63
64
  method_source (1.0.0)
64
65
  mini_mime (1.1.2)
65
66
  minitest (5.18.0)
66
- nokogiri (1.14.4-arm64-darwin)
67
+ nokogiri (1.15.2-arm64-darwin)
67
68
  racc (~> 1.4)
68
- oj (3.14.3)
69
+ oj (3.15.0)
69
70
  parallel (1.23.0)
70
- parser (3.2.2.1)
71
+ parser (3.2.2.3)
71
72
  ast (~> 2.4.1)
73
+ racc
72
74
  public_suffix (5.0.1)
73
- racc (1.6.2)
75
+ racc (1.7.0)
74
76
  rack (2.2.7)
75
77
  rack-test (2.1.0)
76
78
  rack (>= 1.3)
77
79
  rails-dom-testing (2.0.3)
78
80
  activesupport (>= 4.2.0)
79
81
  nokogiri (>= 1.6)
80
- rails-html-sanitizer (1.5.0)
81
- loofah (~> 2.19, >= 2.19.1)
82
- railties (7.0.4.3)
83
- actionpack (= 7.0.4.3)
84
- activesupport (= 7.0.4.3)
82
+ rails-html-sanitizer (1.6.0)
83
+ loofah (~> 2.21)
84
+ nokogiri (~> 1.14)
85
+ railties (7.0.5)
86
+ actionpack (= 7.0.5)
87
+ activesupport (= 7.0.5)
85
88
  method_source
86
89
  rake (>= 12.2)
87
90
  thor (~> 1.0)
88
91
  zeitwerk (~> 2.5)
89
92
  rainbow (3.1.1)
90
93
  rake (13.0.6)
91
- regexp_parser (2.8.0)
94
+ regexp_parser (2.8.1)
92
95
  rexml (3.2.5)
93
- rubocop (1.51.0)
96
+ rubocop (1.52.1)
94
97
  json (~> 2.3)
95
98
  parallel (~> 1.10)
96
- parser (>= 3.2.0.0)
99
+ parser (>= 3.2.2.3)
97
100
  rainbow (>= 2.2.2, < 4.0)
98
101
  regexp_parser (>= 1.8, < 3.0)
99
102
  rexml (>= 3.2.5, < 4.0)
100
103
  rubocop-ast (>= 1.28.0, < 2.0)
101
104
  ruby-progressbar (~> 1.7)
102
105
  unicode-display_width (>= 2.4.0, < 3.0)
103
- rubocop-ast (1.28.1)
106
+ rubocop-ast (1.29.0)
104
107
  parser (>= 3.2.1.0)
105
108
  rubocop-md (1.2.0)
106
109
  rubocop (>= 1.0)
@@ -108,7 +111,7 @@ GEM
108
111
  rubocop (>= 1.39, < 2.0)
109
112
  rubocop-packaging (0.5.2)
110
113
  rubocop (>= 1.33, < 2.0)
111
- rubocop-performance (1.17.1)
114
+ rubocop-performance (1.18.0)
112
115
  rubocop (>= 1.7.0, < 2.0)
113
116
  rubocop-ast (>= 0.4.0)
114
117
  rubocop-rails (2.19.1)
@@ -126,9 +129,9 @@ GEM
126
129
  rubocop-rails (~> 2.0)
127
130
  ruby-progressbar (1.13.0)
128
131
  rubyzip (2.3.2)
129
- selenium-devtools (0.113.0)
132
+ selenium-devtools (0.114.0)
130
133
  selenium-webdriver (~> 4.2)
131
- selenium-webdriver (4.9.1)
134
+ selenium-webdriver (4.10.0)
132
135
  rexml (~> 3.2, >= 3.2.5)
133
136
  rubyzip (>= 1.2.2, < 3.0)
134
137
  websocket (~> 1.0)
@@ -150,6 +153,7 @@ PLATFORMS
150
153
  arm64-darwin-22
151
154
 
152
155
  DEPENDENCIES
156
+ curb (~> 1.0, >= 1.0.5)
153
157
  dotenv (~> 2.7.6)
154
158
  minitest (~> 5.0)
155
159
  rake (~> 13.0)
@@ -45,15 +45,17 @@ module Zorki
45
45
  end.first["userInteractionCount"]
46
46
 
47
47
  unless graphql_object["video"].empty?
48
- video = graphql_object["video"].first["contentUrl"]
49
- video_preview_image = graphql_object["video"].first["thumbnailUrl"]
48
+ video_url = graphql_object["video"].first["contentUrl"]
49
+ video = Zorki.retrieve_media(video_url)
50
+
51
+ video_preview_image_url = graphql_object["video"].first["thumbnailUrl"]
52
+ video_preview_image = Zorki.retrieve_media(video_preview_image_url)
50
53
  end
51
54
  else
52
55
  # We need to see if this is a single image post or a slideshow. We do that
53
56
  # by looking for a single image, if it's not there, we assume the alternative.
54
57
  graphql_object = graphql_object["data"]["xdt_api__v1__media__shortcode__web_info"]
55
58
 
56
-
57
59
  unless graphql_object["items"][0].has_key?("video_versions") && !graphql_object["items"][0]["video_versions"].nil?
58
60
  # Check if there is a slideshow or not
59
61
  unless graphql_object["items"][0].has_key?("carousel_media") && !graphql_object["items"][0]["carousel_media"].nil?
@@ -7,6 +7,7 @@ require "selenium-webdriver"
7
7
  require "logger"
8
8
  require "debug"
9
9
  require "securerandom"
10
+ require "selenium/webdriver/remote/http/curb"
10
11
 
11
12
  # 2022-06-07 14:15:23 WARN Selenium [DEPRECATION] [:browser_options] :options as a parameter for driver initialization is deprecated. Use :capabilities with an Array of value capabilities/options if necessary instead.
12
13
 
@@ -16,13 +17,14 @@ options.add_argument("--no-sandbox")
16
17
  options.add_argument("--disable-dev-shm-usage")
17
18
  options.add_argument("–-disable-blink-features=AutomationControlled")
18
19
  options.add_argument("--disable-extensions")
20
+ options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
19
21
  options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
20
22
  options.add_preference "password_manager_enabled", false
21
23
  options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
22
24
 
23
25
  Capybara.register_driver :selenium_zorki do |app|
24
- client = Selenium::WebDriver::Remote::Http::Default.new
25
- client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
26
+ client = Selenium::WebDriver::Remote::Http::Curb.new
27
+ # client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
26
28
  Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
27
29
  end
28
30
 
@@ -94,7 +96,7 @@ module Zorki
94
96
  end
95
97
 
96
98
  # Now that the intercept is set up, we visit the page we want
97
- visit(url)
99
+ page.driver.browser.navigate.to(url)
98
100
  # We wait until the correct intercept is processed or we've waited 60 seconds
99
101
  start_time = Time.now
100
102
  # puts "Waiting.... #{url}"
@@ -134,14 +136,16 @@ module Zorki
134
136
  options.add_argument("--disable-dev-shm-usage")
135
137
  options.add_argument("–-disable-blink-features=AutomationControlled")
136
138
  options.add_argument("--disable-extensions")
139
+ options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
140
+
137
141
  options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
138
142
  options.add_preference "password_manager_enabled", false
139
143
  options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
140
144
  # options.add_argument("--user-data-dir=/tmp/tarun")
141
145
 
142
146
  Capybara.register_driver :selenium do |app|
143
- client = Selenium::WebDriver::Remote::Http::Default.new
144
- client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
147
+ client = Selenium::WebDriver::Remote::Http::Curb.new
148
+ # client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
145
149
  Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
146
150
  end
147
151
 
@@ -153,7 +157,12 @@ module Zorki
153
157
  page.quit
154
158
 
155
159
  # Check if we're on a Instagram page already, if not visit it.
156
- visit ("https://instagram.com") unless page.driver.browser.current_url.include? "instagram.com"
160
+ unless page.driver.browser.current_url.include? "instagram.com"
161
+ # There seems to be a bug in the Linux ARM64 version of chromedriver where this will properly
162
+ # navigate but then timeout, crashing it all up. So instead we check and raise the error when
163
+ # that then fails again.
164
+ page.driver.browser.navigate.to("https://instagram.com")
165
+ end
157
166
 
158
167
  # We don't have to login if we already are
159
168
  begin
@@ -19,6 +19,7 @@ module Zorki
19
19
  login
20
20
 
21
21
  graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "?username=")
22
+ graphql_script = graphql_script.first if graphql_script.class == Array
22
23
 
23
24
  if graphql_script.has_key?("author") && !graphql_script["author"].nil?
24
25
  user = graphql_script["author"]
@@ -28,26 +29,28 @@ module Zorki
28
29
  raise Zorki::Error unless username == scraped_username
29
30
 
30
31
  number_of_posts = graphql_script["interactionStatistic"].select do |stat|
31
- stat["interactionType"] == "https://schema.org/FilmAction"
32
+ ["https://schema.org/FilmAction", "http://schema.org/WriteAction"].include?(stat["interactionType"])
32
33
  end.first
33
34
 
34
35
  number_of_followers = graphql_script["interactionStatistic"].select do |stat|
35
36
  stat["interactionType"] == "http://schema.org/FollowAction"
36
37
  end.first
37
38
 
38
- profile_image_url = user["image"]
39
- {
40
- name: user["name"],
41
- username: username,
42
- number_of_posts: Integer(number_of_posts["userInteractionCount"]),
43
- number_of_followers: Integer(number_of_followers["userInteractionCount"]),
44
- # number_of_following: user["edge_follow"]["count"],
45
- verified: user["is_verified"], # todo
46
- profile: graphql_script["description"],
47
- profile_link: user["sameAs"],
48
- profile_image: Zorki.retrieve_media(profile_image_url),
49
- profile_image_url: profile_image_url
50
- }
39
+ begin
40
+ profile_image_url = user["image"]
41
+ {
42
+ name: user["name"],
43
+ username: username,
44
+ number_of_posts: Integer(number_of_posts["userInteractionCount"]),
45
+ number_of_followers: Integer(number_of_followers["userInteractionCount"]),
46
+ # number_of_following: user["edge_follow"]["count"],
47
+ verified: user["is_verified"], # todo
48
+ profile: graphql_script["description"],
49
+ profile_link: user["sameAs"],
50
+ profile_image: Zorki.retrieve_media(profile_image_url),
51
+ profile_image_url: profile_image_url
52
+ }
53
+ end
51
54
  else
52
55
  user = graphql_script["data"]["user"]
53
56
 
data/lib/zorki/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Zorki
4
- VERSION = "0.1.1"
4
+ VERSION = "0.1.3"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: zorki
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Christopher Guess
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-05-23 00:00:00.000000000 Z
11
+ date: 2023-06-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: capybara