zorki 0.1.20 → 0.1.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/zorki/scrapers/post_scraper.rb +22 -0
- data/lib/zorki/scrapers/scraper.rb +7 -2
- data/lib/zorki/version.rb +1 -1
- data/lib/zorki.rb +24 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '0228694f5aafe38a9856efe3a4f9f042b94aeecbd5cb7ec919326f48c0b8000e'
|
4
|
+
data.tar.gz: c54da1aa5b98d3166a047a24022b74356538e7dcb974e032bc5236065c5edd4c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 30b2953e778ce0a6e36f221350c6760974de7279c764111cfe2327f3bcc5f11a93d19a4747d20c35260038e07c06bad2c3d2fc704b4878c23a41ba309ae689d9
|
7
|
+
data.tar.gz: 99ed979bcc6f4f0758bca49d34580333ba42c35d74ce3ccfe52871acb9665d3841e0d54ed68bb3af2cf1ef366ebb1f3f9ea9136784958778751d7142f09e5b38
|
@@ -5,6 +5,28 @@ require "typhoeus"
|
|
5
5
|
module Zorki
|
6
6
|
class PostScraper < Scraper
|
7
7
|
# Scrapes the post identified by +id+, retrying when an image comes back zero sized.
#
# Delegates the actual scraping to attempt_parse. If that raises
# ImageRequestZeroSize the scrape is attempted again, up to two attempts in total.
#
# @param id the post identifier, passed through unchanged to attempt_parse
# @return whatever attempt_parse returns on the first successful attempt
# @raise [ImageRequestZeroSize] if every attempt fails with a zero sized image
def parse(id)
  max_attempts = 2
  failed_attempts = 0

  while failed_attempts < max_attempts
    puts "Retrieving ID #{id}"

    begin
      return attempt_parse(id)
    rescue ImageRequestZeroSize
      # If the image is zero size, we retry
      puts "Zero sized image found, retrying #{failed_attempts}"
      failed_attempts += 1
    end
  end

  # Every attempt failed: surface the error instead of silently returning nil.
  # The previous guard compared the counter against 5 while the loop stopped at 2,
  # so it could never fire.
  raise ImageRequestZeroSize
end
|
28
|
+
|
29
|
+
def attempt_parse(id)
|
8
30
|
# Stuff we need to get from the DOM (implemented is starred):
|
9
31
|
# - User *
|
10
32
|
# - Text *
|
@@ -172,6 +172,7 @@ module Zorki
|
|
172
172
|
|
173
173
|
def login
|
174
174
|
puts "Attempting to login..."
|
175
|
+
|
175
176
|
# Reset the sessions so that there's nothing laying around
|
176
177
|
# page.driver.browser.close
|
177
178
|
|
@@ -185,7 +186,10 @@ module Zorki
|
|
185
186
|
|
186
187
|
# We don't have to login if we already are
|
187
188
|
begin
|
188
|
-
|
189
|
+
if find_field("Search", wait: 10).present?
|
190
|
+
puts "Already logged in"
|
191
|
+
return
|
192
|
+
end
|
189
193
|
rescue Capybara::ElementNotFound; end
|
190
194
|
|
191
195
|
# Check if we're redirected to a login page, if we aren't we're already logged in
|
@@ -213,9 +217,10 @@ module Zorki
|
|
213
217
|
# Sometimes Instagram just... doesn't let you log in
|
214
218
|
raise "Instagram not accessible" if loop_count == 5
|
215
219
|
|
220
|
+
puts "Login successful"
|
216
221
|
# No we don't want to save our login credentials
|
217
222
|
begin
|
218
|
-
puts "Checking and clearing Save Info button
|
223
|
+
puts "Checking and clearing Save Info button"
|
219
224
|
|
220
225
|
find_button("Save Info").click()
|
221
226
|
rescue Capybara::ElementNotFound; end
|
data/lib/zorki/version.rb
CHANGED
data/lib/zorki.rb
CHANGED
@@ -46,11 +46,34 @@ module Zorki
|
|
46
46
|
end
|
47
47
|
end
|
48
48
|
|
49
|
+
# Raised by retrieve_media when a downloaded file is still too small to be a real
# image after all of its download attempts have been used up.
class ImageRequestZeroSize < RetryableError; end
|
50
|
+
|
49
51
|
define_setting :temp_storage_location, "tmp/zorki"
|
50
52
|
|
51
53
|
# Get an image from a URL and save to a temp folder set in the configuration under
|
52
54
|
# temp_storage_location
|
55
|
+
|
56
|
+
# We do this because sometimes the images are coming back sized zero
|
53
57
|
# Downloads the media at +url+ via attempt_retrieve_media, retrying when the saved
# file is suspiciously small (images sometimes come back sized zero).
#
# Up to five download attempts are made. A file of 100 bytes or fewer is treated
# as a failed download: it is deleted and the download is tried again.
#
# @param url the media URL, passed through unchanged to attempt_retrieve_media
# @return the temp file name returned by attempt_retrieve_media for the first
#   download larger than 100 bytes
# @raise [ImageRequestZeroSize] if all five attempts produce a too-small file
def self.retrieve_media(url)
  5.times do
    temp_file_name = attempt_retrieve_media(url)

    # Anything larger than 100 bytes is considered a real download
    return temp_file_name if File.size(temp_file_name) > 100

    # Delete the file since we want to retry
    File.delete(temp_file_name)
  end

  raise ImageRequestZeroSize
end
|
75
|
+
|
76
|
+
def self.attempt_retrieve_media(url)
|
54
77
|
response = Typhoeus.get(url)
|
55
78
|
|
56
79
|
# Get the file extension if it's in the file
|
@@ -69,6 +92,7 @@ module Zorki
|
|
69
92
|
# We do this in case the folder isn't created yet, since it's a temp folder we'll just do so
|
70
93
|
self.create_temp_storage_location
|
71
94
|
File.binwrite(temp_file_name, response.body)
|
95
|
+
|
72
96
|
temp_file_name
|
73
97
|
end
|
74
98
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zorki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.20
|
4
|
+
version: 0.1.21
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christopher Guess
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-03-
|
11
|
+
date: 2024-03-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|