zorki 0.1.22 → 0.1.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/zorki/monkeypatch.rb +1 -1
- data/lib/zorki/scrapers/post_scraper.rb +5 -0
- data/lib/zorki/scrapers/scraper.rb +17 -3
- data/lib/zorki/scrapers/user_scraper.rb +7 -5
- data/lib/zorki/version.rb +1 -1
- data/lib/zorki.rb +11 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fa95bb5ca131ca6b7faed2aab200329579b77012cb079da75a0ad90a60daa5bd
|
4
|
+
data.tar.gz: 82eb0cc29af2cfaeafa8774e027904959f953a50faa03d71dbcbeba9595ac520
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 127638ce83ed09be71f194ba3e5dc269374890a1bd89874c8f158896a7664d41724698a258916f63e373b1c6f3dd3b4d315fac49a15e17ff457035ec4d345120
|
7
|
+
data.tar.gz: 105b7e148774a82640ed48bffe46b21d941ce0a18987f59ef0e79a3a058077fbd9dda651811bbc421ff9fcc8323de152b2ac5afbff0e5b9d4e5c65d27f5ac85c
|
data/lib/zorki/monkeypatch.rb
CHANGED
@@ -41,7 +41,7 @@ module SeleniumMonkeypatch
|
|
41
41
|
data[:sessionId] = @session_id if @session_id
|
42
42
|
message = @ws.send_cmd(**data)
|
43
43
|
if message.nil? == false && message["error"] && (method != "Fetch.continueRequest")
|
44
|
-
raise Error::WebDriverError, error_message(message["error"])
|
44
|
+
raise Selenium::WebDriver::Error::WebDriverError, error_message(message["error"])
|
45
45
|
end
|
46
46
|
|
47
47
|
message
|
@@ -23,6 +23,9 @@ module Zorki
|
|
23
23
|
raise ImageRequestZeroSize if count == 5
|
24
24
|
|
25
25
|
result
|
26
|
+
ensure
|
27
|
+
page.quit
|
28
|
+
# Make sure it's quit? I'm not sure we really want to do this outside of testing.
|
26
29
|
end
|
27
30
|
|
28
31
|
def attempt_parse(id)
|
@@ -44,6 +47,7 @@ module Zorki
|
|
44
47
|
"data,xdt_api__v1__media__shortcode__web_info,items"
|
45
48
|
)
|
46
49
|
|
50
|
+
|
47
51
|
graphql_object = graphql_object.first if graphql_object.kind_of?(Array)
|
48
52
|
|
49
53
|
# For pages that have been marked misinfo the structure is very different than not
|
@@ -61,6 +65,7 @@ module Zorki
|
|
61
65
|
text = graphql_object["articleBody"]
|
62
66
|
username = graphql_object["author"]["identifier"]["value"]
|
63
67
|
# 2021-04-01T17:07:10-07:00
|
68
|
+
|
64
69
|
date = DateTime.strptime(graphql_object["dateCreated"], "%Y-%m-%dT%H:%M:%S%z")
|
65
70
|
interactions = graphql_object["interactionStatistic"]
|
66
71
|
number_of_likes = interactions.select do |x|
|
@@ -72,11 +72,14 @@ module Zorki
|
|
72
72
|
|
73
73
|
continue.call(request) do |response|
|
74
74
|
# Check if not a CORS prefetch and finish up if not
|
75
|
-
if !response.body
|
75
|
+
if !response.body&.empty? && response.body
|
76
76
|
check_passed = true
|
77
|
+
|
77
78
|
unless additional_search_parameters.nil?
|
78
79
|
body_to_check = Oj.load(response.body)
|
79
80
|
|
81
|
+
debugger if body_to_check.include?("jokoy.komi.io")
|
82
|
+
|
80
83
|
search_parameters = additional_search_parameters.split(",")
|
81
84
|
search_parameters.each_with_index do |key, index|
|
82
85
|
break if body_to_check.nil?
|
@@ -119,7 +122,17 @@ module Zorki
|
|
119
122
|
begin
|
120
123
|
element_json = JSON.parse(element.text)
|
121
124
|
|
122
|
-
|
125
|
+
if element.text.include?("jokoy.komi.io")
|
126
|
+
debugger
|
127
|
+
# if element_json["require"].first.last.first["__bbox"].key?("require")
|
128
|
+
|
129
|
+
# element_json["require"].first.last.first["__bbox"]["require"].each do |x|
|
130
|
+
# debugger if x.to_s.include?("Si mulut pelaut")
|
131
|
+
# end
|
132
|
+
# end
|
133
|
+
end
|
134
|
+
|
135
|
+
parsed_element_json = element_json["require"].last.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
|
123
136
|
rescue StandardError
|
124
137
|
next
|
125
138
|
end
|
@@ -137,9 +150,10 @@ module Zorki
|
|
137
150
|
raise ContentUnavailableError.new("Response body nil") if response_body.nil?
|
138
151
|
Oj.load(response_body)
|
139
152
|
ensure
|
140
|
-
page.quit
|
153
|
+
# page.quit
|
141
154
|
# TRY THIS TO MAKE SURE CHROME GETS CLOSED?
|
142
155
|
# We may also want to not do this and make sure the same browser is reused instead for cookie purposes
|
156
|
+
# NOW we're trying this 2024-05-28
|
143
157
|
end
|
144
158
|
|
145
159
|
private
|
@@ -18,7 +18,7 @@ module Zorki
|
|
18
18
|
# - *Profile image
|
19
19
|
login
|
20
20
|
|
21
|
-
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "
|
21
|
+
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql", "data,user,full_name")
|
22
22
|
graphql_script = graphql_script.first if graphql_script.class == Array
|
23
23
|
|
24
24
|
if graphql_script.nil?
|
@@ -62,13 +62,13 @@ module Zorki
|
|
62
62
|
scraped_username = user["username"]
|
63
63
|
raise Zorki::Error unless username == scraped_username
|
64
64
|
|
65
|
-
profile_image_url = user["
|
65
|
+
profile_image_url = user["hd_profile_pic_url_info"]["url"]
|
66
66
|
{
|
67
67
|
name: user["full_name"],
|
68
68
|
username: username,
|
69
|
-
number_of_posts: user["
|
70
|
-
number_of_followers: user["
|
71
|
-
number_of_following: user["
|
69
|
+
number_of_posts: user["media_count"],
|
70
|
+
number_of_followers: user["follower_count"],
|
71
|
+
number_of_following: user["following_count"],
|
72
72
|
verified: user["is_verified"],
|
73
73
|
profile: user["biography"],
|
74
74
|
profile_link: user["external_url"],
|
@@ -76,6 +76,8 @@ module Zorki
|
|
76
76
|
profile_image_url: profile_image_url
|
77
77
|
}
|
78
78
|
end
|
79
|
+
rescue Zorki::ContentUnavailableError
|
80
|
+
raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
|
79
81
|
end
|
80
82
|
end
|
81
83
|
end
|
data/lib/zorki/version.rb
CHANGED
data/lib/zorki.rb
CHANGED
@@ -32,6 +32,17 @@ module Zorki
|
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
35
|
+
# Error raised when scraping a user fails. Carries an optional hash of extra
# details about the failure alongside the message.
class UserScrapingError < Error
  # Reader required by #to_honeybadger_context; without it the bare
  # `additional_data` call there has nothing to resolve to in this class.
  #
  # @return [Hash] extra details supplied when the error was created
  attr_reader :additional_data

  # @param msg [String] human-readable description of the failure
  # @param additional_data [Hash] extra details to attach to the error
  def initialize(msg = "Zorki encountered an error scraping a user", additional_data: {})
    super(msg)
    @additional_data = additional_data
  end

  # Returns the attached details as a context hash.
  # NOTE(review): presumably consumed by Honeybadger's `to_honeybadger_context`
  # exception hook — confirm against the error-reporting setup.
  #
  # @return [Hash] the additional data attached to this error
  def to_honeybadger_context
    additional_data
  end
end
|
45
|
+
|
35
46
|
class RetryableError < Error; end
|
36
47
|
|
37
48
|
class ImageRequestTimedOutError < RetryableError
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zorki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.22
|
4
|
+
version: 0.1.24
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christopher Guess
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-06-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|