zorki 0.1.24 → 0.1.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +50 -46
- data/lib/zorki/scrapers/scraper.rb +2 -0
- data/lib/zorki/scrapers/user_scraper.rb +28 -6
- data/lib/zorki/version.rb +1 -1
- data/lib/zorki.rb +2 -2
- data/zorki.gemspec +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ec1dba1c80d66f33d1e50fc104511a5842091529c6d2b1e3f4758508ae37468a
|
4
|
+
data.tar.gz: b510d48cbc12d0e9b70fb17fb91aeffdad51d85f85e5813695df1c86279894db
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6463fc59b818e21d4c515212d04549b7a3b925433d7e9df79ff376a786f74e028ab99146b8422c64384470443d88835854717e3a19a7c68bb3a767e7277f1c39
|
7
|
+
data.tar.gz: 173b570bb5eb62d759a488c21aff0092c633db1f0f1a4fa771e46ad32141a2016386f698e724cbc1a7dced5ad66a9d7b22add8f7dd533d8dc88ec009ca9a2814
|
data/Gemfile.lock
CHANGED
@@ -1,35 +1,36 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
zorki (0.1.
|
4
|
+
zorki (0.1.25)
|
5
5
|
apparition
|
6
6
|
capybara
|
7
7
|
oj
|
8
8
|
selenium-devtools
|
9
|
-
selenium-webdriver (~> 4.
|
9
|
+
selenium-webdriver (~> 4.24.0)
|
10
10
|
typhoeus
|
11
11
|
|
12
12
|
GEM
|
13
13
|
remote: https://rubygems.org/
|
14
14
|
specs:
|
15
|
-
activesupport (7.1
|
15
|
+
activesupport (7.2.1)
|
16
16
|
base64
|
17
17
|
bigdecimal
|
18
|
-
concurrent-ruby (~> 1.0, >= 1.
|
18
|
+
concurrent-ruby (~> 1.0, >= 1.3.1)
|
19
19
|
connection_pool (>= 2.2.5)
|
20
20
|
drb
|
21
21
|
i18n (>= 1.6, < 2)
|
22
|
+
logger (>= 1.4.2)
|
22
23
|
minitest (>= 5.1)
|
23
|
-
|
24
|
-
tzinfo (~> 2.0)
|
25
|
-
addressable (2.8.
|
26
|
-
public_suffix (>= 2.0.2, <
|
24
|
+
securerandom (>= 0.3)
|
25
|
+
tzinfo (~> 2.0, >= 2.0.5)
|
26
|
+
addressable (2.8.7)
|
27
|
+
public_suffix (>= 2.0.2, < 7.0)
|
27
28
|
apparition (0.6.0)
|
28
29
|
capybara (~> 3.13, < 4)
|
29
30
|
websocket-driver (>= 0.6.5)
|
30
31
|
ast (2.4.2)
|
31
32
|
base64 (0.2.0)
|
32
|
-
bigdecimal (3.1.
|
33
|
+
bigdecimal (3.1.8)
|
33
34
|
capybara (3.40.0)
|
34
35
|
addressable
|
35
36
|
matrix
|
@@ -39,75 +40,76 @@ GEM
|
|
39
40
|
rack-test (>= 0.6.3)
|
40
41
|
regexp_parser (>= 1.5, < 3.0)
|
41
42
|
xpath (~> 3.2)
|
42
|
-
concurrent-ruby (1.
|
43
|
+
concurrent-ruby (1.3.4)
|
43
44
|
connection_pool (2.4.1)
|
44
|
-
curb (1.0.
|
45
|
-
debug (1.9.
|
45
|
+
curb (1.0.6)
|
46
|
+
debug (1.9.2)
|
46
47
|
irb (~> 1.10)
|
47
48
|
reline (>= 0.3.8)
|
48
49
|
dotenv (2.7.6)
|
49
50
|
drb (2.2.1)
|
50
51
|
ethon (0.16.0)
|
51
52
|
ffi (>= 1.15.0)
|
52
|
-
ffi (1.
|
53
|
-
i18n (1.14.
|
53
|
+
ffi (1.17.0-arm64-darwin)
|
54
|
+
i18n (1.14.5)
|
54
55
|
concurrent-ruby (~> 1.0)
|
55
56
|
io-console (0.7.2)
|
56
|
-
irb (1.
|
57
|
-
rdoc
|
57
|
+
irb (1.14.0)
|
58
|
+
rdoc (>= 4.0.0)
|
58
59
|
reline (>= 0.4.2)
|
59
|
-
json (2.7.
|
60
|
+
json (2.7.2)
|
60
61
|
language_server-protocol (3.17.0.3)
|
62
|
+
logger (1.6.1)
|
61
63
|
matrix (0.4.2)
|
62
64
|
mini_mime (1.1.5)
|
63
|
-
minitest (5.
|
64
|
-
|
65
|
-
nokogiri (1.16.3-arm64-darwin)
|
65
|
+
minitest (5.25.1)
|
66
|
+
nokogiri (1.16.7-arm64-darwin)
|
66
67
|
racc (~> 1.4)
|
67
|
-
oj (3.16.
|
68
|
+
oj (3.16.6)
|
68
69
|
bigdecimal (>= 3.0)
|
69
|
-
|
70
|
-
|
70
|
+
ostruct (>= 0.2)
|
71
|
+
ostruct (0.6.0)
|
72
|
+
parallel (1.26.3)
|
73
|
+
parser (3.3.5.0)
|
71
74
|
ast (~> 2.4.1)
|
72
75
|
racc
|
73
76
|
psych (5.1.2)
|
74
77
|
stringio
|
75
|
-
public_suffix (
|
76
|
-
racc (1.
|
78
|
+
public_suffix (6.0.1)
|
79
|
+
racc (1.8.1)
|
77
80
|
rack (3.0.8)
|
78
81
|
rack-test (2.1.0)
|
79
82
|
rack (>= 1.3)
|
80
83
|
rainbow (3.1.1)
|
81
|
-
rake (13.1
|
82
|
-
rdoc (6.
|
84
|
+
rake (13.2.1)
|
85
|
+
rdoc (6.7.0)
|
83
86
|
psych (>= 4.0.0)
|
84
|
-
regexp_parser (2.9.
|
85
|
-
reline (0.5.
|
87
|
+
regexp_parser (2.9.2)
|
88
|
+
reline (0.5.10)
|
86
89
|
io-console (~> 0.5)
|
87
|
-
rexml (3.
|
88
|
-
rubocop (1.
|
90
|
+
rexml (3.3.7)
|
91
|
+
rubocop (1.66.1)
|
89
92
|
json (~> 2.3)
|
90
93
|
language_server-protocol (>= 3.17.0)
|
91
94
|
parallel (~> 1.10)
|
92
95
|
parser (>= 3.3.0.2)
|
93
96
|
rainbow (>= 2.2.2, < 4.0)
|
94
|
-
regexp_parser (>=
|
95
|
-
|
96
|
-
rubocop-ast (>= 1.31.1, < 2.0)
|
97
|
+
regexp_parser (>= 2.4, < 3.0)
|
98
|
+
rubocop-ast (>= 1.32.2, < 2.0)
|
97
99
|
ruby-progressbar (~> 1.7)
|
98
100
|
unicode-display_width (>= 2.4.0, < 3.0)
|
99
|
-
rubocop-ast (1.
|
100
|
-
parser (>= 3.3.0
|
101
|
-
rubocop-md (1.2.
|
102
|
-
rubocop (>= 1.
|
103
|
-
rubocop-minitest (0.
|
101
|
+
rubocop-ast (1.32.3)
|
102
|
+
parser (>= 3.3.1.0)
|
103
|
+
rubocop-md (1.2.3)
|
104
|
+
rubocop (>= 1.45)
|
105
|
+
rubocop-minitest (0.36.0)
|
104
106
|
rubocop (>= 1.61, < 2.0)
|
105
107
|
rubocop-ast (>= 1.31.1, < 2.0)
|
106
108
|
rubocop-packaging (0.5.2)
|
107
109
|
rubocop (>= 1.33, < 2.0)
|
108
|
-
rubocop-performance (1.
|
110
|
+
rubocop-performance (1.21.1)
|
109
111
|
rubocop (>= 1.48.1, < 2.0)
|
110
|
-
rubocop-ast (>= 1.
|
112
|
+
rubocop-ast (>= 1.31.1, < 2.0)
|
111
113
|
rubocop-rails (2.19.1)
|
112
114
|
activesupport (>= 4.2.0)
|
113
115
|
rack (>= 1.1)
|
@@ -122,20 +124,22 @@ GEM
|
|
122
124
|
rubocop-rails (~> 2.0)
|
123
125
|
ruby-progressbar (1.13.0)
|
124
126
|
rubyzip (2.3.2)
|
125
|
-
|
127
|
+
securerandom (0.3.1)
|
128
|
+
selenium-devtools (0.128.0)
|
126
129
|
selenium-webdriver (~> 4.2)
|
127
|
-
selenium-webdriver (4.
|
130
|
+
selenium-webdriver (4.24.0)
|
128
131
|
base64 (~> 0.2)
|
132
|
+
logger (~> 1.4)
|
129
133
|
rexml (~> 3.2, >= 3.2.5)
|
130
134
|
rubyzip (>= 1.2.2, < 3.0)
|
131
135
|
websocket (~> 1.0)
|
132
|
-
stringio (3.1.
|
136
|
+
stringio (3.1.1)
|
133
137
|
typhoeus (1.4.1)
|
134
138
|
ethon (>= 0.9.0)
|
135
139
|
tzinfo (2.0.6)
|
136
140
|
concurrent-ruby (~> 1.0)
|
137
|
-
unicode-display_width (2.
|
138
|
-
websocket (1.2.
|
141
|
+
unicode-display_width (2.6.0)
|
142
|
+
websocket (1.2.11)
|
139
143
|
websocket-driver (0.7.6)
|
140
144
|
websocket-extensions (>= 0.1.0)
|
141
145
|
websocket-extensions (0.1.5)
|
data/lib/zorki/scrapers/scraper.rb
CHANGED
@@ -141,12 +141,14 @@ module Zorki
|
|
141
141
|
end
|
142
142
|
|
143
143
|
if elements&.empty?
|
144
|
+
# debugger
|
144
145
|
raise ContentUnavailableError.new("Cannot find anything", additional_data: { page_source: page.driver.browser.page_source, elements: elements })
|
145
146
|
end
|
146
147
|
|
147
148
|
return elements
|
148
149
|
end
|
149
150
|
|
151
|
+
# debugger if response_body.nil?
|
150
152
|
raise ContentUnavailableError.new("Response body nil") if response_body.nil?
|
151
153
|
Oj.load(response_body)
|
152
154
|
ensure
|
data/lib/zorki/scrapers/user_scraper.rb
CHANGED
@@ -16,13 +16,30 @@ module Zorki
|
|
16
16
|
# - *description
|
17
17
|
# - *links
|
18
18
|
# - *Profile image
|
19
|
-
login
|
20
19
|
|
21
|
-
graphql_script =
|
22
|
-
|
20
|
+
graphql_script = nil
|
21
|
+
count = 0
|
22
|
+
loop do
|
23
|
+
raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username }) if count > 3
|
23
24
|
|
24
|
-
|
25
|
-
|
25
|
+
print "Scraping user #{username}... (attempt #{count + 1})\n"
|
26
|
+
begin
|
27
|
+
login
|
28
|
+
|
29
|
+
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "graphql", "data,user,media_count")
|
30
|
+
graphql_script = graphql_script.first if graphql_script.class == Array
|
31
|
+
|
32
|
+
if graphql_script.nil?
|
33
|
+
graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "web_profile_info")
|
34
|
+
end
|
35
|
+
rescue Zorki::ContentUnavailableError => e
|
36
|
+
count += 1
|
37
|
+
page.driver.browser.navigate.to("https://www.instagram.com") # we want to go back to the main page so we start from scratch
|
38
|
+
sleep 10
|
39
|
+
next
|
40
|
+
end
|
41
|
+
|
42
|
+
break
|
26
43
|
end
|
27
44
|
|
28
45
|
if graphql_script.has_key?("author") && !graphql_script["author"].nil?
|
@@ -36,10 +53,14 @@ module Zorki
|
|
36
53
|
["https://schema.org/FilmAction", "http://schema.org/WriteAction"].include?(stat["interactionType"])
|
37
54
|
end.first
|
38
55
|
|
56
|
+
# number_of_posts = graphql_script["data"]["user"]["media_count"] if number_of_posts.nil?
|
57
|
+
|
39
58
|
number_of_followers = graphql_script["interactionStatistic"].select do |stat|
|
40
59
|
stat["interactionType"] == "http://schema.org/FollowAction"
|
41
60
|
end.first
|
42
61
|
|
62
|
+
# number_of_followers = graphql_script["data"]["user"]["follower_count"] if number_of_followers.nil?
|
63
|
+
|
43
64
|
begin
|
44
65
|
profile_image_url = user["image"]
|
45
66
|
{
|
@@ -76,7 +97,8 @@ module Zorki
|
|
76
97
|
profile_image_url: profile_image_url
|
77
98
|
}
|
78
99
|
end
|
79
|
-
rescue Zorki::ContentUnavailableError
|
100
|
+
rescue Zorki::ContentUnavailableError => e
|
101
|
+
debugger
|
80
102
|
raise Zorki::UserScrapingError.new("Zorki could not find user #{username}", additional_data: { username: username })
|
81
103
|
end
|
82
104
|
end
|
data/lib/zorki/version.rb
CHANGED
data/lib/zorki.rb
CHANGED
data/zorki.gemspec
CHANGED
@@ -33,7 +33,7 @@ Gem::Specification.new do |spec|
|
|
33
33
|
spec.add_dependency "apparition" # A Chrome driver for Capybara
|
34
34
|
spec.add_dependency "typhoeus" # For making API requests
|
35
35
|
spec.add_dependency "oj" # A faster JSON parser/loader than stdlib
|
36
|
-
spec.add_dependency "selenium-webdriver", "~> 4.
|
36
|
+
spec.add_dependency "selenium-webdriver", "~> 4.24.0" # Webdriver selenium
|
37
37
|
spec.add_dependency "selenium-devtools" # Allow us to intercept requests
|
38
38
|
|
39
39
|
spec.add_development_dependency "debug"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zorki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.26
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christopher Guess
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-09-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|
@@ -72,14 +72,14 @@ dependencies:
|
|
72
72
|
requirements:
|
73
73
|
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version:
|
75
|
+
version: 4.24.0
|
76
76
|
type: :runtime
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version:
|
82
|
+
version: 4.24.0
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: selenium-devtools
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|