zorki 0.1.26 → 0.1.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ec1dba1c80d66f33d1e50fc104511a5842091529c6d2b1e3f4758508ae37468a
4
- data.tar.gz: b510d48cbc12d0e9b70fb17fb91aeffdad51d85f85e5813695df1c86279894db
3
+ metadata.gz: c5472c0d436e13f2e8554b59051546fed9400ad793de71b9b2d546bb5bd02d08
4
+ data.tar.gz: d62650105cb0f41a48a93d4379e077a4c1b658e96ae13a30c1d8073f8f2e0546
5
5
  SHA512:
6
- metadata.gz: 6463fc59b818e21d4c515212d04549b7a3b925433d7e9df79ff376a786f74e028ab99146b8422c64384470443d88835854717e3a19a7c68bb3a767e7277f1c39
7
- data.tar.gz: 173b570bb5eb62d759a488c21aff0092c633db1f0f1a4fa771e46ad32141a2016386f698e724cbc1a7dced5ad66a9d7b22add8f7dd533d8dc88ec009ca9a2814
6
+ metadata.gz: 84a98236f4ca36daf440a8aea29acec2fa6963508bae78f5ee7c4d92c2ffedf19ef8db4050deadaa5090ea770132d2a47c64a1bab87f52329bdf18dd31f4aa2e
7
+ data.tar.gz: e1b635b352163d08dc0ea9b5e74b3cb990a4f9a7d91ce29296ae2150692612c2a7a81fc9e04bfd33cedfd5c4dab7031e5f06a802a25032aed15036550f306328
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- zorki (0.1.25)
4
+ zorki (0.1.26)
5
5
  apparition
6
6
  capybara
7
7
  oj
@@ -51,8 +51,13 @@ module Zorki
51
51
  # additional_search_parameters is a comma-separated list of keys
52
52
  # example: `data,xdt_api__v1__media__shortcode__web_info,items`
53
53
  #
54
+ # NOTE: `post_data_include`, if not nil, overrides the additional_search_parameters
55
+ # This is so that I didn't have to refactor the entire code base when I added it.
56
+ # Eventually it might be better to look at the post request and see if we can do the
57
+ # same type of search there as we use for users and simplify this whole thing a lot.
58
+ #
54
59
  # @returns Hash a ruby hash of the JSON data
55
- def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
60
+ def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil, post_data_include: nil)
56
61
  # So this is fun:
57
62
  # For pages marked as misinformation we have to use one method (interception of request) and
58
63
  # for pages that are not, we can just pull the data straight from the page.
@@ -65,21 +70,28 @@ module Zorki
65
70
  # the one we want, and then moves on.
66
71
  response_body = nil
67
72
 
73
+ responses = []
68
74
  page.driver.browser.intercept do |request, &continue|
69
75
  # This passes the request forward unmodified, since we only care about the response
70
- # puts "checking request: #{request.url}"
76
+ #
77
+ # responses.first.post_data.include?("render_surface%22%3A%22PROFILE")
71
78
  continue.call(request) && next unless request.url.include?(subpage_search)
79
+ continue.call(request) && next unless !post_data_include.nil? && request.post_data.include?(post_data_include)
72
80
 
73
81
  continue.call(request) do |response|
82
+ puts "***********************************************************"
83
+ puts "checking request: #{request.url}"
84
+ puts response.body
85
+ puts "***********************************************************"
86
+
87
+ # responses << response
74
88
  # Check if not a CORS prefetch and finish up if not
75
89
  if !response.body&.empty? && response.body
76
90
  check_passed = true
77
91
 
78
- unless additional_search_parameters.nil?
92
+ if !additional_search_parameters.nil? && post_data_include.nil?
79
93
  body_to_check = Oj.load(response.body)
80
94
 
81
- debugger if body_to_check.include?("jokoy.komi.io")
82
-
83
95
  search_parameters = additional_search_parameters.split(",")
84
96
  search_parameters.each_with_index do |key, index|
85
97
  break if body_to_check.nil?
@@ -89,6 +101,13 @@ module Zorki
89
101
  end
90
102
  end
91
103
 
104
+ if check_passed == false
105
+ puts "***********************************************************"
106
+ puts "checking FAILED request: #{request.url}"
107
+ puts response.body
108
+ puts "***********************************************************"
109
+ end
110
+
92
111
  response_body = response.body if check_passed == true
93
112
  end
94
113
  end
@@ -120,17 +139,17 @@ module Zorki
120
139
  elements = doc.search("script").filter_map do |element|
121
140
  parsed_element_json = nil
122
141
  begin
123
- element_json = JSON.parse(element.text)
142
+ element_json = OJ.load(element.text)
124
143
 
125
- if element.text.include?("jokoy.komi.io")
126
- debugger
144
+ # if element.text.include?("jokoy.komi.io")
145
+ # debugger
127
146
  # if element_json["require"].first.last.first["__bbox"].key?("require")
128
147
 
129
148
  # element_json["require"].first.last.first["__bbox"]["require"].each do |x|
130
149
  # debugger if x.to_s.include?("Si mulut pelaut")
131
150
  # end
132
151
  # end
133
- end
152
+ # end
134
153
 
135
154
  parsed_element_json = element_json["require"].last.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
136
155
  rescue StandardError
@@ -20,13 +20,11 @@ module Zorki
20
20
  graphql_script = nil
21
21
  count = 0
22
22
  loop do
23
- raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username }) if count > 3
24
-
25
23
  print "Scraping user #{username}... (attempt #{count + 1})\n"
26
24
  begin
27
25
  login
28
26
 
29
- graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql", "data,user,media_count")
27
+ graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql/query", "data,user,media_count", post_data_include: "render_surface")
30
28
  graphql_script = graphql_script.first if graphql_script.class == Array
31
29
 
32
30
  if graphql_script.nil?
@@ -34,8 +32,13 @@ module Zorki
34
32
  end
35
33
  rescue Zorki::ContentUnavailableError => e
36
34
  count += 1
35
+
36
+ if count > 3
37
+ raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
38
+ end
39
+
37
40
  page.driver.browser.navigate.to("https://www.instagram.com") # we want to go back to the main page so we start from scratch
38
- sleep 10
41
+ sleep rand(5..10)
39
42
  next
40
43
  end
41
44
 
data/lib/zorki/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Zorki
4
- VERSION = "0.1.26"
4
+ VERSION = "0.1.27"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: zorki
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.26
4
+ version: 0.1.27
5
5
  platform: ruby
6
6
  authors:
7
7
  - Christopher Guess
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-09-16 00:00:00.000000000 Z
11
+ date: 2024-10-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: capybara