zorki 0.1.23 → 0.1.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +50 -46
- data/lib/zorki/scrapers/scraper.rb +7 -2
- data/lib/zorki/scrapers/user_scraper.rb +29 -5
- data/lib/zorki/version.rb +1 -1
- data/lib/zorki.rb +12 -1
- data/zorki.gemspec +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ec1dba1c80d66f33d1e50fc104511a5842091529c6d2b1e3f4758508ae37468a
|
4
|
+
data.tar.gz: b510d48cbc12d0e9b70fb17fb91aeffdad51d85f85e5813695df1c86279894db
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6463fc59b818e21d4c515212d04549b7a3b925433d7e9df79ff376a786f74e028ab99146b8422c64384470443d88835854717e3a19a7c68bb3a767e7277f1c39
|
7
|
+
data.tar.gz: 173b570bb5eb62d759a488c21aff0092c633db1f0f1a4fa771e46ad32141a2016386f698e724cbc1a7dced5ad66a9d7b22add8f7dd533d8dc88ec009ca9a2814
|
data/Gemfile.lock
CHANGED
@@ -1,35 +1,36 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
zorki (0.1.
|
4
|
+
zorki (0.1.25)
|
5
5
|
apparition
|
6
6
|
capybara
|
7
7
|
oj
|
8
8
|
selenium-devtools
|
9
|
-
selenium-webdriver (~> 4.
|
9
|
+
selenium-webdriver (~> 4.24.0)
|
10
10
|
typhoeus
|
11
11
|
|
12
12
|
GEM
|
13
13
|
remote: https://rubygems.org/
|
14
14
|
specs:
|
15
|
-
activesupport (7.1
|
15
|
+
activesupport (7.2.1)
|
16
16
|
base64
|
17
17
|
bigdecimal
|
18
|
-
concurrent-ruby (~> 1.0, >= 1.
|
18
|
+
concurrent-ruby (~> 1.0, >= 1.3.1)
|
19
19
|
connection_pool (>= 2.2.5)
|
20
20
|
drb
|
21
21
|
i18n (>= 1.6, < 2)
|
22
|
+
logger (>= 1.4.2)
|
22
23
|
minitest (>= 5.1)
|
23
|
-
|
24
|
-
tzinfo (~> 2.0)
|
25
|
-
addressable (2.8.
|
26
|
-
public_suffix (>= 2.0.2, <
|
24
|
+
securerandom (>= 0.3)
|
25
|
+
tzinfo (~> 2.0, >= 2.0.5)
|
26
|
+
addressable (2.8.7)
|
27
|
+
public_suffix (>= 2.0.2, < 7.0)
|
27
28
|
apparition (0.6.0)
|
28
29
|
capybara (~> 3.13, < 4)
|
29
30
|
websocket-driver (>= 0.6.5)
|
30
31
|
ast (2.4.2)
|
31
32
|
base64 (0.2.0)
|
32
|
-
bigdecimal (3.1.
|
33
|
+
bigdecimal (3.1.8)
|
33
34
|
capybara (3.40.0)
|
34
35
|
addressable
|
35
36
|
matrix
|
@@ -39,75 +40,76 @@ GEM
|
|
39
40
|
rack-test (>= 0.6.3)
|
40
41
|
regexp_parser (>= 1.5, < 3.0)
|
41
42
|
xpath (~> 3.2)
|
42
|
-
concurrent-ruby (1.
|
43
|
+
concurrent-ruby (1.3.4)
|
43
44
|
connection_pool (2.4.1)
|
44
|
-
curb (1.0.
|
45
|
-
debug (1.9.
|
45
|
+
curb (1.0.6)
|
46
|
+
debug (1.9.2)
|
46
47
|
irb (~> 1.10)
|
47
48
|
reline (>= 0.3.8)
|
48
49
|
dotenv (2.7.6)
|
49
50
|
drb (2.2.1)
|
50
51
|
ethon (0.16.0)
|
51
52
|
ffi (>= 1.15.0)
|
52
|
-
ffi (1.
|
53
|
-
i18n (1.14.
|
53
|
+
ffi (1.17.0-arm64-darwin)
|
54
|
+
i18n (1.14.5)
|
54
55
|
concurrent-ruby (~> 1.0)
|
55
56
|
io-console (0.7.2)
|
56
|
-
irb (1.
|
57
|
-
rdoc
|
57
|
+
irb (1.14.0)
|
58
|
+
rdoc (>= 4.0.0)
|
58
59
|
reline (>= 0.4.2)
|
59
|
-
json (2.7.
|
60
|
+
json (2.7.2)
|
60
61
|
language_server-protocol (3.17.0.3)
|
62
|
+
logger (1.6.1)
|
61
63
|
matrix (0.4.2)
|
62
64
|
mini_mime (1.1.5)
|
63
|
-
minitest (5.
|
64
|
-
|
65
|
-
nokogiri (1.16.3-arm64-darwin)
|
65
|
+
minitest (5.25.1)
|
66
|
+
nokogiri (1.16.7-arm64-darwin)
|
66
67
|
racc (~> 1.4)
|
67
|
-
oj (3.16.
|
68
|
+
oj (3.16.6)
|
68
69
|
bigdecimal (>= 3.0)
|
69
|
-
|
70
|
-
|
70
|
+
ostruct (>= 0.2)
|
71
|
+
ostruct (0.6.0)
|
72
|
+
parallel (1.26.3)
|
73
|
+
parser (3.3.5.0)
|
71
74
|
ast (~> 2.4.1)
|
72
75
|
racc
|
73
76
|
psych (5.1.2)
|
74
77
|
stringio
|
75
|
-
public_suffix (
|
76
|
-
racc (1.
|
78
|
+
public_suffix (6.0.1)
|
79
|
+
racc (1.8.1)
|
77
80
|
rack (3.0.8)
|
78
81
|
rack-test (2.1.0)
|
79
82
|
rack (>= 1.3)
|
80
83
|
rainbow (3.1.1)
|
81
|
-
rake (13.1
|
82
|
-
rdoc (6.
|
84
|
+
rake (13.2.1)
|
85
|
+
rdoc (6.7.0)
|
83
86
|
psych (>= 4.0.0)
|
84
|
-
regexp_parser (2.9.
|
85
|
-
reline (0.5.
|
87
|
+
regexp_parser (2.9.2)
|
88
|
+
reline (0.5.10)
|
86
89
|
io-console (~> 0.5)
|
87
|
-
rexml (3.
|
88
|
-
rubocop (1.
|
90
|
+
rexml (3.3.7)
|
91
|
+
rubocop (1.66.1)
|
89
92
|
json (~> 2.3)
|
90
93
|
language_server-protocol (>= 3.17.0)
|
91
94
|
parallel (~> 1.10)
|
92
95
|
parser (>= 3.3.0.2)
|
93
96
|
rainbow (>= 2.2.2, < 4.0)
|
94
|
-
regexp_parser (>=
|
95
|
-
|
96
|
-
rubocop-ast (>= 1.31.1, < 2.0)
|
97
|
+
regexp_parser (>= 2.4, < 3.0)
|
98
|
+
rubocop-ast (>= 1.32.2, < 2.0)
|
97
99
|
ruby-progressbar (~> 1.7)
|
98
100
|
unicode-display_width (>= 2.4.0, < 3.0)
|
99
|
-
rubocop-ast (1.
|
100
|
-
parser (>= 3.3.0
|
101
|
-
rubocop-md (1.2.
|
102
|
-
rubocop (>= 1.
|
103
|
-
rubocop-minitest (0.
|
101
|
+
rubocop-ast (1.32.3)
|
102
|
+
parser (>= 3.3.1.0)
|
103
|
+
rubocop-md (1.2.3)
|
104
|
+
rubocop (>= 1.45)
|
105
|
+
rubocop-minitest (0.36.0)
|
104
106
|
rubocop (>= 1.61, < 2.0)
|
105
107
|
rubocop-ast (>= 1.31.1, < 2.0)
|
106
108
|
rubocop-packaging (0.5.2)
|
107
109
|
rubocop (>= 1.33, < 2.0)
|
108
|
-
rubocop-performance (1.
|
110
|
+
rubocop-performance (1.21.1)
|
109
111
|
rubocop (>= 1.48.1, < 2.0)
|
110
|
-
rubocop-ast (>= 1.
|
112
|
+
rubocop-ast (>= 1.31.1, < 2.0)
|
111
113
|
rubocop-rails (2.19.1)
|
112
114
|
activesupport (>= 4.2.0)
|
113
115
|
rack (>= 1.1)
|
@@ -122,20 +124,22 @@ GEM
|
|
122
124
|
rubocop-rails (~> 2.0)
|
123
125
|
ruby-progressbar (1.13.0)
|
124
126
|
rubyzip (2.3.2)
|
125
|
-
|
127
|
+
securerandom (0.3.1)
|
128
|
+
selenium-devtools (0.128.0)
|
126
129
|
selenium-webdriver (~> 4.2)
|
127
|
-
selenium-webdriver (4.
|
130
|
+
selenium-webdriver (4.24.0)
|
128
131
|
base64 (~> 0.2)
|
132
|
+
logger (~> 1.4)
|
129
133
|
rexml (~> 3.2, >= 3.2.5)
|
130
134
|
rubyzip (>= 1.2.2, < 3.0)
|
131
135
|
websocket (~> 1.0)
|
132
|
-
stringio (3.1.
|
136
|
+
stringio (3.1.1)
|
133
137
|
typhoeus (1.4.1)
|
134
138
|
ethon (>= 0.9.0)
|
135
139
|
tzinfo (2.0.6)
|
136
140
|
concurrent-ruby (~> 1.0)
|
137
|
-
unicode-display_width (2.
|
138
|
-
websocket (1.2.
|
141
|
+
unicode-display_width (2.6.0)
|
142
|
+
websocket (1.2.11)
|
139
143
|
websocket-driver (0.7.6)
|
140
144
|
websocket-extensions (>= 0.1.0)
|
141
145
|
websocket-extensions (0.1.5)
|
data/lib/zorki/scrapers/scraper.rb
CHANGED
@@ -78,6 +78,8 @@ module Zorki
|
|
78
78
|
unless additional_search_parameters.nil?
|
79
79
|
body_to_check = Oj.load(response.body)
|
80
80
|
|
81
|
+
debugger if body_to_check.include?("jokoy.komi.io")
|
82
|
+
|
81
83
|
search_parameters = additional_search_parameters.split(",")
|
82
84
|
search_parameters.each_with_index do |key, index|
|
83
85
|
break if body_to_check.nil?
|
@@ -120,11 +122,12 @@ module Zorki
|
|
120
122
|
begin
|
121
123
|
element_json = JSON.parse(element.text)
|
122
124
|
|
123
|
-
if element.text.include?("
|
125
|
+
if element.text.include?("jokoy.komi.io")
|
126
|
+
debugger
|
124
127
|
# if element_json["require"].first.last.first["__bbox"].key?("require")
|
125
128
|
|
126
129
|
# element_json["require"].first.last.first["__bbox"]["require"].each do |x|
|
127
|
-
# debugger if x.to_s.include?("
|
130
|
+
# debugger if x.to_s.include?("Si mulut pelaut")
|
128
131
|
# end
|
129
132
|
# end
|
130
133
|
end
|
@@ -138,12 +141,14 @@ module Zorki
|
|
138
141
|
end
|
139
142
|
|
140
143
|
if elements&.empty?
|
144
|
+
# debugger
|
141
145
|
raise ContentUnavailableError.new("Cannot find anything", additional_data: { page_source: page.driver.browser.page_source, elements: elements })
|
142
146
|
end
|
143
147
|
|
144
148
|
return elements
|
145
149
|
end
|
146
150
|
|
151
|
+
# debugger if response_body.nil?
|
147
152
|
raise ContentUnavailableError.new("Response body nil") if response_body.nil?
|
148
153
|
Oj.load(response_body)
|
149
154
|
ensure
|
data/lib/zorki/scrapers/user_scraper.rb
CHANGED
@@ -16,13 +16,30 @@ module Zorki
|
|
16
16
|
# - *description
|
17
17
|
# - *links
|
18
18
|
# - *Profile image
|
19
|
-
login
|
20
19
|
|
21
|
-
graphql_script =
|
22
|
-
|
20
|
+
graphql_script = nil
|
21
|
+
count = 0
|
22
|
+
loop do
|
23
|
+
raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username }) if count > 3
|
23
24
|
|
24
|
-
|
25
|
-
|
25
|
+
print "Scraping user #{username}... (attempt #{count + 1})\n"
|
26
|
+
begin
|
27
|
+
login
|
28
|
+
|
29
|
+
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql", "data,user,media_count")
|
30
|
+
graphql_script = graphql_script.first if graphql_script.class == Array
|
31
|
+
|
32
|
+
if graphql_script.nil?
|
33
|
+
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "web_profile_info")
|
34
|
+
end
|
35
|
+
rescue Zorki::ContentUnavailableError => e
|
36
|
+
count += 1
|
37
|
+
page.driver.browser.navigate.to("https://www.instagram.com") # we want to go back to the main page so we start from scratch
|
38
|
+
sleep 10
|
39
|
+
next
|
40
|
+
end
|
41
|
+
|
42
|
+
break
|
26
43
|
end
|
27
44
|
|
28
45
|
if graphql_script.has_key?("author") && !graphql_script["author"].nil?
|
@@ -36,10 +53,14 @@ module Zorki
|
|
36
53
|
["https://schema.org/FilmAction", "http://schema.org/WriteAction"].include?(stat["interactionType"])
|
37
54
|
end.first
|
38
55
|
|
56
|
+
# number_of_posts = graphql_script["data"]["user"]["media_count"] if number_of_posts.nil?
|
57
|
+
|
39
58
|
number_of_followers = graphql_script["interactionStatistic"].select do |stat|
|
40
59
|
stat["interactionType"] == "http://schema.org/FollowAction"
|
41
60
|
end.first
|
42
61
|
|
62
|
+
# number_of_followers = graphql_script["data"]["user"]["follower_count"] if number_of_followers.nil?
|
63
|
+
|
43
64
|
begin
|
44
65
|
profile_image_url = user["image"]
|
45
66
|
{
|
@@ -76,6 +97,9 @@ module Zorki
|
|
76
97
|
profile_image_url: profile_image_url
|
77
98
|
}
|
78
99
|
end
|
100
|
+
rescue Zorki::ContentUnavailableError => e
|
101
|
+
debugger
|
102
|
+
raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
|
79
103
|
end
|
80
104
|
end
|
81
105
|
end
|
data/lib/zorki/version.rb
CHANGED
data/lib/zorki.rb
CHANGED
@@ -28,7 +28,18 @@ module Zorki
|
|
28
28
|
end
|
29
29
|
|
30
30
|
def to_honeybadger_context
|
31
|
-
additional_data
|
31
|
+
@additional_data
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
class UserScrapingError < Error
|
36
|
+
def initialize(msg = "Zorki encountered an error scraping a user", additional_data: {})
|
37
|
+
super(msg)
|
38
|
+
@additional_data = additional_data
|
39
|
+
end
|
40
|
+
|
41
|
+
def to_honeybadger_context
|
42
|
+
@additional_data
|
32
43
|
end
|
33
44
|
end
|
34
45
|
|
data/zorki.gemspec
CHANGED
@@ -33,7 +33,7 @@ Gem::Specification.new do |spec|
|
|
33
33
|
spec.add_dependency "apparition" # A Chrome driver for Capybara
|
34
34
|
spec.add_dependency "typhoeus" # For making API requests
|
35
35
|
spec.add_dependency "oj" # A faster JSON parser/loader than stdlib
|
36
|
-
spec.add_dependency "selenium-webdriver", "~> 4.
|
36
|
+
spec.add_dependency "selenium-webdriver", "~> 4.24.0" # Webdriver selenium
|
37
37
|
spec.add_dependency "selenium-devtools" # Allow us to intercept requests
|
38
38
|
|
39
39
|
spec.add_development_dependency "debug"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zorki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.23
|
4
|
+
version: 0.1.26
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christopher Guess
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-09-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|
@@ -72,14 +72,14 @@ dependencies:
|
|
72
72
|
requirements:
|
73
73
|
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version:
|
75
|
+
version: 4.24.0
|
76
76
|
type: :runtime
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version:
|
82
|
+
version: 4.24.0
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: selenium-devtools
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|