zorki 0.1.24 → 0.1.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fa95bb5ca131ca6b7faed2aab200329579b77012cb079da75a0ad90a60daa5bd
4
- data.tar.gz: 82eb0cc29af2cfaeafa8774e027904959f953a50faa03d71dbcbeba9595ac520
3
+ metadata.gz: c5472c0d436e13f2e8554b59051546fed9400ad793de71b9b2d546bb5bd02d08
4
+ data.tar.gz: d62650105cb0f41a48a93d4379e077a4c1b658e96ae13a30c1d8073f8f2e0546
5
5
  SHA512:
6
- metadata.gz: 127638ce83ed09be71f194ba3e5dc269374890a1bd89874c8f158896a7664d41724698a258916f63e373b1c6f3dd3b4d315fac49a15e17ff457035ec4d345120
7
- data.tar.gz: 105b7e148774a82640ed48bffe46b21d941ce0a18987f59ef0e79a3a058077fbd9dda651811bbc421ff9fcc8323de152b2ac5afbff0e5b9d4e5c65d27f5ac85c
6
+ metadata.gz: 84a98236f4ca36daf440a8aea29acec2fa6963508bae78f5ee7c4d92c2ffedf19ef8db4050deadaa5090ea770132d2a47c64a1bab87f52329bdf18dd31f4aa2e
7
+ data.tar.gz: e1b635b352163d08dc0ea9b5e74b3cb990a4f9a7d91ce29296ae2150692612c2a7a81fc9e04bfd33cedfd5c4dab7031e5f06a802a25032aed15036550f306328
data/Gemfile.lock CHANGED
@@ -1,35 +1,36 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- zorki (0.1.9)
4
+ zorki (0.1.26)
5
5
  apparition
6
6
  capybara
7
7
  oj
8
8
  selenium-devtools
9
- selenium-webdriver (~> 4.19)
9
+ selenium-webdriver (~> 4.24.0)
10
10
  typhoeus
11
11
 
12
12
  GEM
13
13
  remote: https://rubygems.org/
14
14
  specs:
15
- activesupport (7.1.3.2)
15
+ activesupport (7.2.1)
16
16
  base64
17
17
  bigdecimal
18
- concurrent-ruby (~> 1.0, >= 1.0.2)
18
+ concurrent-ruby (~> 1.0, >= 1.3.1)
19
19
  connection_pool (>= 2.2.5)
20
20
  drb
21
21
  i18n (>= 1.6, < 2)
22
+ logger (>= 1.4.2)
22
23
  minitest (>= 5.1)
23
- mutex_m
24
- tzinfo (~> 2.0)
25
- addressable (2.8.6)
26
- public_suffix (>= 2.0.2, < 6.0)
24
+ securerandom (>= 0.3)
25
+ tzinfo (~> 2.0, >= 2.0.5)
26
+ addressable (2.8.7)
27
+ public_suffix (>= 2.0.2, < 7.0)
27
28
  apparition (0.6.0)
28
29
  capybara (~> 3.13, < 4)
29
30
  websocket-driver (>= 0.6.5)
30
31
  ast (2.4.2)
31
32
  base64 (0.2.0)
32
- bigdecimal (3.1.7)
33
+ bigdecimal (3.1.8)
33
34
  capybara (3.40.0)
34
35
  addressable
35
36
  matrix
@@ -39,75 +40,76 @@ GEM
39
40
  rack-test (>= 0.6.3)
40
41
  regexp_parser (>= 1.5, < 3.0)
41
42
  xpath (~> 3.2)
42
- concurrent-ruby (1.2.3)
43
+ concurrent-ruby (1.3.4)
43
44
  connection_pool (2.4.1)
44
- curb (1.0.5)
45
- debug (1.9.1)
45
+ curb (1.0.6)
46
+ debug (1.9.2)
46
47
  irb (~> 1.10)
47
48
  reline (>= 0.3.8)
48
49
  dotenv (2.7.6)
49
50
  drb (2.2.1)
50
51
  ethon (0.16.0)
51
52
  ffi (>= 1.15.0)
52
- ffi (1.16.3)
53
- i18n (1.14.4)
53
+ ffi (1.17.0-arm64-darwin)
54
+ i18n (1.14.5)
54
55
  concurrent-ruby (~> 1.0)
55
56
  io-console (0.7.2)
56
- irb (1.12.0)
57
- rdoc
57
+ irb (1.14.0)
58
+ rdoc (>= 4.0.0)
58
59
  reline (>= 0.4.2)
59
- json (2.7.1)
60
+ json (2.7.2)
60
61
  language_server-protocol (3.17.0.3)
62
+ logger (1.6.1)
61
63
  matrix (0.4.2)
62
64
  mini_mime (1.1.5)
63
- minitest (5.22.3)
64
- mutex_m (0.2.0)
65
- nokogiri (1.16.3-arm64-darwin)
65
+ minitest (5.25.1)
66
+ nokogiri (1.16.7-arm64-darwin)
66
67
  racc (~> 1.4)
67
- oj (3.16.3)
68
+ oj (3.16.6)
68
69
  bigdecimal (>= 3.0)
69
- parallel (1.24.0)
70
- parser (3.3.0.5)
70
+ ostruct (>= 0.2)
71
+ ostruct (0.6.0)
72
+ parallel (1.26.3)
73
+ parser (3.3.5.0)
71
74
  ast (~> 2.4.1)
72
75
  racc
73
76
  psych (5.1.2)
74
77
  stringio
75
- public_suffix (5.0.4)
76
- racc (1.7.3)
78
+ public_suffix (6.0.1)
79
+ racc (1.8.1)
77
80
  rack (3.0.8)
78
81
  rack-test (2.1.0)
79
82
  rack (>= 1.3)
80
83
  rainbow (3.1.1)
81
- rake (13.1.0)
82
- rdoc (6.6.3.1)
84
+ rake (13.2.1)
85
+ rdoc (6.7.0)
83
86
  psych (>= 4.0.0)
84
- regexp_parser (2.9.0)
85
- reline (0.5.0)
87
+ regexp_parser (2.9.2)
88
+ reline (0.5.10)
86
89
  io-console (~> 0.5)
87
- rexml (3.2.6)
88
- rubocop (1.62.1)
90
+ rexml (3.3.7)
91
+ rubocop (1.66.1)
89
92
  json (~> 2.3)
90
93
  language_server-protocol (>= 3.17.0)
91
94
  parallel (~> 1.10)
92
95
  parser (>= 3.3.0.2)
93
96
  rainbow (>= 2.2.2, < 4.0)
94
- regexp_parser (>= 1.8, < 3.0)
95
- rexml (>= 3.2.5, < 4.0)
96
- rubocop-ast (>= 1.31.1, < 2.0)
97
+ regexp_parser (>= 2.4, < 3.0)
98
+ rubocop-ast (>= 1.32.2, < 2.0)
97
99
  ruby-progressbar (~> 1.7)
98
100
  unicode-display_width (>= 2.4.0, < 3.0)
99
- rubocop-ast (1.31.2)
100
- parser (>= 3.3.0.4)
101
- rubocop-md (1.2.2)
102
- rubocop (>= 1.0)
103
- rubocop-minitest (0.35.0)
101
+ rubocop-ast (1.32.3)
102
+ parser (>= 3.3.1.0)
103
+ rubocop-md (1.2.3)
104
+ rubocop (>= 1.45)
105
+ rubocop-minitest (0.36.0)
104
106
  rubocop (>= 1.61, < 2.0)
105
107
  rubocop-ast (>= 1.31.1, < 2.0)
106
108
  rubocop-packaging (0.5.2)
107
109
  rubocop (>= 1.33, < 2.0)
108
- rubocop-performance (1.20.2)
110
+ rubocop-performance (1.21.1)
109
111
  rubocop (>= 1.48.1, < 2.0)
110
- rubocop-ast (>= 1.30.0, < 2.0)
112
+ rubocop-ast (>= 1.31.1, < 2.0)
111
113
  rubocop-rails (2.19.1)
112
114
  activesupport (>= 4.2.0)
113
115
  rack (>= 1.1)
@@ -122,20 +124,22 @@ GEM
122
124
  rubocop-rails (~> 2.0)
123
125
  ruby-progressbar (1.13.0)
124
126
  rubyzip (2.3.2)
125
- selenium-devtools (0.123.0)
127
+ securerandom (0.3.1)
128
+ selenium-devtools (0.128.0)
126
129
  selenium-webdriver (~> 4.2)
127
- selenium-webdriver (4.19.0)
130
+ selenium-webdriver (4.24.0)
128
131
  base64 (~> 0.2)
132
+ logger (~> 1.4)
129
133
  rexml (~> 3.2, >= 3.2.5)
130
134
  rubyzip (>= 1.2.2, < 3.0)
131
135
  websocket (~> 1.0)
132
- stringio (3.1.0)
136
+ stringio (3.1.1)
133
137
  typhoeus (1.4.1)
134
138
  ethon (>= 0.9.0)
135
139
  tzinfo (2.0.6)
136
140
  concurrent-ruby (~> 1.0)
137
- unicode-display_width (2.5.0)
138
- websocket (1.2.10)
141
+ unicode-display_width (2.6.0)
142
+ websocket (1.2.11)
139
143
  websocket-driver (0.7.6)
140
144
  websocket-extensions (>= 0.1.0)
141
145
  websocket-extensions (0.1.5)
@@ -51,8 +51,13 @@ module Zorki
51
51
  # additional_search_parameters is a comma-separated list of keys
52
52
  # example: `data,xdt_api__v1__media__shortcode__web_info,items`
53
53
  #
54
+ # NOTE: `post_data_include`, if not nil, overrides `additional_search_parameters`.
55
+ # This is so that I didn't have to refactor the entire code base when I added it.
56
+ # Eventually it might be better to look at the post request and see if we can do the
57
+ # same type of search there as we use for users and simplify this whole thing a lot.
58
+ #
54
59
  # @return [Hash] a Ruby hash of the JSON data
55
- def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
60
+ def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil, post_data_include: nil)
56
61
  # So this is fun:
57
62
  # For pages marked as misinformation we have to use one method (interception of request) and
58
63
  # for pages that are not, we can just pull the data straight from the page.
@@ -65,21 +70,28 @@ module Zorki
65
70
  # the one we want, and then moves on.
66
71
  response_body = nil
67
72
 
73
+ responses = []
68
74
  page.driver.browser.intercept do |request, &continue|
69
75
  # This passes the request forward unmodified, since we only care about the response
70
- # puts "checking request: #{request.url}"
76
+ #
77
+ # responses.first.post_data.include?("render_surface%22%3A%22PROFILE")
71
78
  continue.call(request) && next unless request.url.include?(subpage_search)
79
+ continue.call(request) && next unless !post_data_include.nil? && request.post_data.include?(post_data_include)
72
80
 
73
81
  continue.call(request) do |response|
82
+ puts "***********************************************************"
83
+ puts "checking request: #{request.url}"
84
+ puts response.body
85
+ puts "***********************************************************"
86
+
87
+ # responses << response
74
88
  # Check if not a CORS prefetch and finish up if not
75
89
  if !response.body&.empty? && response.body
76
90
  check_passed = true
77
91
 
78
- unless additional_search_parameters.nil?
92
+ if !additional_search_parameters.nil? && post_data_include.nil?
79
93
  body_to_check = Oj.load(response.body)
80
94
 
81
- debugger if body_to_check.include?("jokoy.komi.io")
82
-
83
95
  search_parameters = additional_search_parameters.split(",")
84
96
  search_parameters.each_with_index do |key, index|
85
97
  break if body_to_check.nil?
@@ -89,6 +101,13 @@ module Zorki
89
101
  end
90
102
  end
91
103
 
104
+ if check_passed == false
105
+ puts "***********************************************************"
106
+ puts "checking FAILED request: #{request.url}"
107
+ puts response.body
108
+ puts "***********************************************************"
109
+ end
110
+
92
111
  response_body = response.body if check_passed == true
93
112
  end
94
113
  end
@@ -120,17 +139,17 @@ module Zorki
120
139
  elements = doc.search("script").filter_map do |element|
121
140
  parsed_element_json = nil
122
141
  begin
123
- element_json = JSON.parse(element.text)
142
+ element_json = OJ.load(element.text)
124
143
 
125
- if element.text.include?("jokoy.komi.io")
126
- debugger
144
+ # if element.text.include?("jokoy.komi.io")
145
+ # debugger
127
146
  # if element_json["require"].first.last.first["__bbox"].key?("require")
128
147
 
129
148
  # element_json["require"].first.last.first["__bbox"]["require"].each do |x|
130
149
  # debugger if x.to_s.include?("Si mulut pelaut")
131
150
  # end
132
151
  # end
133
- end
152
+ # end
134
153
 
135
154
  parsed_element_json = element_json["require"].last.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
136
155
  rescue StandardError
@@ -141,12 +160,14 @@ module Zorki
141
160
  end
142
161
 
143
162
  if elements&.empty?
163
+ # debugger
144
164
  raise ContentUnavailableError.new("Cannot find anything", additional_data: { page_source: page.driver.browser.page_source, elements: elements })
145
165
  end
146
166
 
147
167
  return elements
148
168
  end
149
169
 
170
+ # debugger if response_body.nil?
150
171
  raise ContentUnavailableError.new("Response body nil") if response_body.nil?
151
172
  Oj.load(response_body)
152
173
  ensure
@@ -16,13 +16,33 @@ module Zorki
16
16
  # - *description
17
17
  # - *links
18
18
  # - *Profile image
19
- login
20
19
 
21
- graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql", "data,user,full_name")
22
- graphql_script = graphql_script.first if graphql_script.class == Array
20
+ graphql_script = nil
21
+ count = 0
22
+ loop do
23
+ print "Scraping user #{username}... (attempt #{count + 1})\n"
24
+ begin
25
+ login
26
+
27
+ graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", "data,user,media_count", post_data_include: "render_surface")
28
+ graphql_script = graphql_script.first if graphql_script.class == Array
29
+
30
+ if graphql_script.nil?
31
+ graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "web_profile_info")
32
+ end
33
+ rescue Zorki::ContentUnavailableError => e
34
+ count += 1
35
+
36
+ if count > 3
37
+ raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
38
+ end
23
39
 
24
- if graphql_script.nil?
25
- graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "web_profile_info")
40
+ page.driver.browser.navigate.to("https://www.instagram.com") # we want to go back to the main page so we start from scratch
41
+ sleep rand(5..10)
42
+ next
43
+ end
44
+
45
+ break
26
46
  end
27
47
 
28
48
  if graphql_script.has_key?("author") && !graphql_script["author"].nil?
@@ -36,10 +56,14 @@ module Zorki
36
56
  ["https://schema.org/FilmAction", "http://schema.org/WriteAction"].include?(stat["interactionType"])
37
57
  end.first
38
58
 
59
+ # number_of_posts = graphql_script["data"]["user"]["media_count"] if number_of_posts.nil?
60
+
39
61
  number_of_followers = graphql_script["interactionStatistic"].select do |stat|
40
62
  stat["interactionType"] == "http://schema.org/FollowAction"
41
63
  end.first
42
64
 
65
+ # number_of_followers = graphql_script["data"]["user"]["follower_count"] if number_of_followers.nil?
66
+
43
67
  begin
44
68
  profile_image_url = user["image"]
45
69
  {
@@ -76,7 +100,8 @@ module Zorki
76
100
  profile_image_url: profile_image_url
77
101
  }
78
102
  end
79
- rescue Zorki::ContentUnavailableError
103
+ rescue Zorki::ContentUnavailableError => e
104
+ debugger
80
105
  raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
81
106
  end
82
107
  end
data/lib/zorki/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Zorki
4
- VERSION = "0.1.24"
4
+ VERSION = "0.1.27"
5
5
  end
data/lib/zorki.rb CHANGED
@@ -28,7 +28,7 @@ module Zorki
28
28
  end
29
29
 
30
30
  def to_honeybadger_context
31
- additional_data
31
+ @additional_data
32
32
  end
33
33
  end
34
34
 
@@ -39,7 +39,7 @@ module Zorki
39
39
  end
40
40
 
41
41
  def to_honeybadger_context
42
- additional_data
42
+ @additional_data
43
43
  end
44
44
  end
45
45
 
data/zorki.gemspec CHANGED
@@ -33,7 +33,7 @@ Gem::Specification.new do |spec|
33
33
  spec.add_dependency "apparition" # A Chrome driver for Capybara
34
34
  spec.add_dependency "typhoeus" # For making API requests
35
35
  spec.add_dependency "oj" # A faster JSON parser/loader than stdlib
36
- spec.add_dependency "selenium-webdriver", "~> 4.19" # Webdriver selenium
36
+ spec.add_dependency "selenium-webdriver", "~> 4.24.0" # Webdriver selenium
37
37
  spec.add_dependency "selenium-devtools" # Allow us to intercept requests
38
38
 
39
39
  spec.add_development_dependency "debug"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: zorki
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.24
4
+ version: 0.1.27
5
5
  platform: ruby
6
6
  authors:
7
7
  - Christopher Guess
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-06-19 00:00:00.000000000 Z
11
+ date: 2024-10-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: capybara
@@ -72,14 +72,14 @@ dependencies:
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '4.19'
75
+ version: 4.24.0
76
76
  type: :runtime
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '4.19'
82
+ version: 4.24.0
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: selenium-devtools
85
85
  requirement: !ruby/object:Gem::Requirement