zorki 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/main.yml +18 -0
- data/.gitignore +12 -0
- data/.rubocop.yml +67 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +84 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +162 -0
- data/LICENSE.txt +21 -0
- data/README.md +85 -0
- data/Rakefile +16 -0
- data/bin/console +15 -0
- data/bin/setup +8 -0
- data/lib/generators/zorki.rb +3 -0
- data/lib/generators/zorki_generator.rb +6 -0
- data/lib/helpers/configuration.rb +28 -0
- data/lib/zorki/monkeypatch.rb +50 -0
- data/lib/zorki/post.rb +46 -0
- data/lib/zorki/scrapers/post_scraper.rb +125 -0
- data/lib/zorki/scrapers/scraper.rb +227 -0
- data/lib/zorki/scrapers/user_scraper.rb +74 -0
- data/lib/zorki/user.rb +52 -0
- data/lib/zorki/version.rb +5 -0
- data/lib/zorki.rb +74 -0
- data/zorki.gemspec +43 -0
- data/zorki.logs +300 -0
- metadata +152 -0
data/lib/zorki/scrapers/post_scraper.rb
ADDED
@@ -0,0 +1,125 @@

```ruby
# frozen_string_literal: true

require "typhoeus"

module Zorki
  class PostScraper < Scraper
    def parse(id)
      # Stuff we need to get from the DOM (implemented is starred):
      # - User *
      # - Text *
      # - Image * / Images * / Video *
      # - Date *
      # - Number of likes *
      # - Hashtags

      Capybara.app_host = "https://instagram.com"

      # video slideshows https://www.instagram.com/p/CY7KxwYOFBS/?utm_source=ig_embed&utm_campaign=loading
      login
      graphql_object = get_content_of_subpage_from_url(
        "https://www.instagram.com/p/#{id}/",
        "/graphql",
        "data,xdt_api__v1__media__shortcode__web_info,items"
      )

      # For pages that have been marked misinfo the structure is very different than not.
      # If it is a clean post then it's just a schema.org thing, but if it's misinfo it's the old
      # way of deeply nested stuff.
      #
      # First we check which one we're getting.

      if graphql_object.has_key?("articleBody")
        # Let's just parse the images first
        images = graphql_object["image"].map do |image|
          Zorki.retrieve_media(image["url"])
        end

        text = graphql_object["articleBody"]
        username = graphql_object["author"]["identifier"]["value"]
        # 2021-04-01T17:07:10-07:00
        date = DateTime.strptime(graphql_object["dateCreated"], "%Y-%m-%dT%H:%M:%S%z")
        interactions = graphql_object["interactionStatistic"]
        number_of_likes = interactions.select do |x|
          x["interactionType"] == "http://schema.org/LikeAction"
        end.first["userInteractionCount"]

        unless graphql_object["video"].empty?
          video = graphql_object["video"].first["contentUrl"]
          video_preview_image = graphql_object["video"].first["thumbnailUrl"]
        end
      else
        # We need to see if this is a single image post or a slideshow. We do that
        # by looking for a single image; if it's not there, we assume the alternative.
        graphql_object = graphql_object["data"]["xdt_api__v1__media__shortcode__web_info"]

        unless graphql_object["items"][0].has_key?("video_versions") && !graphql_object["items"][0]["video_versions"].nil?
          # Check if there is a slideshow or not
          unless graphql_object["items"][0].has_key?("carousel_media") && !graphql_object["items"][0]["carousel_media"].nil?
            # Single image
            image_url = graphql_object["items"][0]["image_versions2"]["candidates"][0]["url"]
            images = [Zorki.retrieve_media(image_url)]
          else
            # Slideshow
            images = graphql_object["items"][0]["carousel_media"].map do |media|
              Zorki.retrieve_media(media["image_versions2"]["candidates"][0]["url"])
            end
          end
        else
          # Some of these I've seen in both ways, thus the commented-out lines
          # video_url = graphql_object["entry_data"]["PostPage"].first["graphql"]["shortcode_media"]["video_url"]
          video_url = graphql_object["items"][0]["video_versions"][0]["url"]
          video = Zorki.retrieve_media(video_url)
          # video_preview_image_url = graphql_object["entry_data"]["PostPage"].first["graphql"]["shortcode_media"]["display_resources"].last["src"]
          video_preview_image_url = graphql_object["items"][0]["image_versions2"]["candidates"][0]["url"]
          video_preview_image = Zorki.retrieve_media(video_preview_image_url)
        end

        unless graphql_object["items"][0]["caption"].nil?
          text = graphql_object["items"][0]["caption"]["text"]
        else
          text = ""
        end

        username = graphql_object["items"][0]["user"]["username"]

        date = DateTime.strptime(graphql_object["items"][0]["taken_at"].to_s, "%s")
        number_of_likes = graphql_object["items"][0]["like_count"]
      end

      screenshot_file = take_screenshot

      # This has to run last since it switches pages
      user = User.lookup([username]).first
      page.quit

      {
        images: images,
        video: video,
        video_preview_image: video_preview_image,
        screenshot_file: screenshot_file,
        text: text,
        date: date,
        number_of_likes: number_of_likes,
        user: user,
        id: id
      }
    end

    def take_screenshot
      # First check if a post has a fact check overlay; if so, clear it.
      # The only issue is that this can take *awhile* to search. Not sure what to do about that
      # since it's Instagram's fault for having such a fucked up obfuscated hierarchy.
      begin
        find_button("See Post").click
        sleep(0.1)
      rescue Capybara::ElementNotFound
        # Do nothing if the element is not found
      end

      # Take the screenshot and return it
      save_screenshot("#{Zorki.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
    end
  end
end
```
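For orientation, `parse` is normally reached through `Zorki::Post` (`data/lib/zorki/post.rb`, whose diff is not shown here), but the scraper can be driven directly. A minimal sketch, assuming valid `INSTAGRAM_USER_NAME`/`INSTAGRAM_PASSWORD` environment variables; the shortcode is the example from the comment in the code above:

```ruby
require "zorki"

# Any public post's /p/<shortcode>/ segment works the same way.
post_attributes = Zorki::PostScraper.new.parse("CY7KxwYOFBS")

post_attributes[:text]          # caption text, "" if the post has none
post_attributes[:images]        # array of temp-file paths from Zorki.retrieve_media
post_attributes[:date]          # DateTime the post was taken
post_attributes[:user].username # the Zorki::User who posted it
```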
data/lib/zorki/scrapers/scraper.rb
ADDED
@@ -0,0 +1,227 @@

```ruby
# frozen_string_literal: true

require "capybara/dsl"
require "dotenv/load"
require "oj"
require "selenium-webdriver"
require "logger"
require "debug"
require "securerandom"

# 2022-06-07 14:15:23 WARN Selenium [DEPRECATION] [:browser_options] :options as a parameter for driver initialization is deprecated. Use :capabilities with an Array of value capabilities/options if necessary instead.

options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
options.add_argument("--start-maximized")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--disable-extensions")
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
options.add_preference "password_manager_enabled", false
options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")

Capybara.register_driver :selenium_zorki do |app|
  client = Selenium::WebDriver::Remote::Http::Default.new
  client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
  Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
end

Capybara.threadsafe = true
Capybara.default_max_wait_time = 60
Capybara.reuse_server = true

module Zorki
  class Scraper # rubocop:disable Metrics/ClassLength
    include Capybara::DSL

    @@logger = Logger.new(STDOUT)
    @@logger.level = Logger::WARN
    @@logger.datetime_format = "%Y-%m-%d %H:%M:%S"
    @@session_id = nil

    def initialize
      Capybara.default_driver = :selenium_zorki
    end

    # Instagram uses GraphQL (like most of Facebook, I think), and returns an object that is
    # actually used to seed the page. We can just parse this for most things.
    #
    # additional_search_parameters is a comma-separated list of keys to walk down, e.g.
    # `data,xdt_api__v1__media__shortcode__web_info,items`
    #
    # @returns Hash a Ruby hash of the JSON data
    def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
      # So this is fun:
      # For pages marked as misinformation we have to use one method (interception of requests) and
      # for pages that are not, we can just pull the data straight from the page.
      #
      # How do we figure out which is which?... for now we'll just run through both and see where we
      # go with it.

      # Our user data no longer lives in the graphql object passed initially with the page.
      # Instead it comes in as part of a subsequent call. This intercepts all calls, checks if it's
      # the one we want, and then moves on.
      response_body = nil

      page.driver.browser.intercept do |request, &continue|
        # This passes the request forward unmodified, since we only care about the response
        # puts "checking request: #{request.url}"

        continue.call(request) && next unless request.url.include?(subpage_search)

        continue.call(request) do |response|
          # Check if not a CORS prefetch and finish up if not
          if response.body.present?
            check_passed = true

            unless additional_search_parameters.nil?
              body_to_check = Oj.load(response.body)

              search_parameters = additional_search_parameters.split(",")
              search_parameters.each do |key|
                break if body_to_check.nil?

                check_passed = false unless body_to_check.has_key?(key)
                body_to_check = body_to_check[key]
              end
            end

            response_body = response.body if check_passed == true
          end
        end
      rescue Selenium::WebDriver::Error::WebDriverError
        # Eat them
      end

      # Now that the intercept is set up, we visit the page we want
      visit(url)
      # We wait until the correct intercept is processed or we've waited 60 seconds
      start_time = Time.now
      # puts "Waiting.... #{url}"

      sleep(rand(1...10))
      while response_body.nil? && (Time.now - start_time) < 60
        sleep(0.1)
      end

      page.driver.execute_script("window.stop();")

      # If this is a page that has not been marked as misinfo we can just pull the data
      # TODO: put this before the whole load loop
      if response_body.nil?
        doc = Nokogiri::HTML(page.driver.browser.page_source)
        elements = doc.search("script").find_all do |e|
          e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
        end

        raise ContentUnavailableError if elements&.empty?
        return Oj.load(elements.first.text)
      end

      raise ContentUnavailableError if response_body.nil?
      Oj.load(response_body)
    end

    private

    ##########
    # Set the session to use a new user folder in the options!
    ##########
    def reset_selenium
      options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
      options.add_argument("--start-maximized")
      options.add_argument("--no-sandbox")
      options.add_argument("--disable-dev-shm-usage")
      options.add_argument("--disable-blink-features=AutomationControlled")
      options.add_argument("--disable-extensions")
      options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
      options.add_preference "password_manager_enabled", false
      options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
      # options.add_argument("--user-data-dir=/tmp/tarun")

      Capybara.register_driver :selenium do |app|
        client = Selenium::WebDriver::Remote::Http::Default.new
        client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
        Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
      end

      Capybara.current_driver = :selenium
    end

    def login
      # Reset the sessions so that there's nothing lying around
      page.quit

      # Check if we're on an Instagram page already; if not, visit it.
      visit("https://instagram.com") unless page.driver.browser.current_url.include? "instagram.com"

      # We don't have to log in if we already are
      begin
        return if find_field("Search", wait: 10).present?
      rescue Capybara::ElementNotFound; end

      # Check if we're redirected to a login page; if we aren't, we're already logged in
      return unless page.has_xpath?('//*[@id="loginForm"]/div/div[3]/button')

      # Try to log in
      loop_count = 0
      while loop_count < 5
        fill_in("username", with: ENV["INSTAGRAM_USER_NAME"])
        fill_in("password", with: ENV["INSTAGRAM_PASSWORD"])

        begin
          click_button("Log in", exact_text: true) # Note: a looser match can hit "Log in with Facebook", which redirects to Facebook's login page
        rescue Capybara::ElementNotFound; end # If we can't find it don't break horribly, just keep waiting

        break unless has_css?('p[data-testid="login-error-message"]', wait: 10)
        loop_count += 1
        sleep(rand * 10.3)
      end

      # Sometimes Instagram just... doesn't let you log in
      raise "Instagram not accessible" if loop_count == 5

      # No, we don't want to save our login credentials
      begin
        click_on("Save Info")
      rescue Capybara::ElementNotFound; end
    end

    def fetch_image(url)
      request = Typhoeus::Request.new(url, followlocation: true)
      request.on_complete do |response|
        if response.success?
          return response.body
        elsif response.timed_out?
          raise Zorki::Error, "Fetching image at #{url} timed out"
        else
          raise Zorki::Error, "Fetching image at #{url} returned non-successful HTTP server response #{response.code}"
        end
      end
      # Actually fire the request; on_complete only registers the callback
      request.run
    end

    # Convert a string such as "1,234" or "1.5m" to an integer
    def number_string_to_integer(number_string)
      # First we have to remove any commas in the number or else it all breaks
      number_string = number_string.delete(",")
      # Is the last digit not a number? If so, we're going to have to multiply it by some multiplier
      should_expand = /[0-9]/.match(number_string[-1, 1]).nil?

      # Get the last index and remove the letter at the end if we should expand
      last_index = should_expand ? number_string.length - 1 : number_string.length
      number = number_string[0, last_index].to_f
      multiplier = 1
      # Determine the multiplier depending on the letter indicated
      case number_string[-1, 1]
      when "m"
        multiplier = 1_000_000
      end

      # Multiply everything and ensure we get an integer back
      (number * multiplier).to_i
    end
  end
end

require_relative "post_scraper"
require_relative "user_scraper"
```
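The comma-separated `additional_search_parameters` act as a key path: an intercepted JSON body is accepted only if every key exists, nested in order. A standalone sketch of that walk, using a made-up payload shaped like the one `PostScraper` waits for:

```ruby
require "oj"

# Hypothetical response body; the real one comes from the intercepted /graphql call.
body = Oj.load('{"data":{"xdt_api__v1__media__shortcode__web_info":{"items":[{"like_count":42}]}}}')

check_passed = true
"data,xdt_api__v1__media__shortcode__web_info,items".split(",").each do |key|
  break if body.nil?

  check_passed = false unless body.has_key?(key)
  body = body[key]
end

check_passed # => true, so this response body would be captured
```

Note also that `number_string_to_integer` only recognizes an `m` suffix: `"1.5m"` becomes `1_500_000` and `"2,345"` becomes `2345`, but `"12.3k"` falls through with a multiplier of 1 and comes back as `12`.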
data/lib/zorki/scrapers/user_scraper.rb
ADDED
@@ -0,0 +1,74 @@

```ruby
# frozen_string_literal: true

require "typhoeus"

module Zorki
  class UserScraper < Scraper
    def parse(username)
      # Stuff we need to get from the DOM (implemented is starred):
      # - *Name
      # - *Username
      # - *No. of posts
      # - *Verified
      # - *No. of followers
      # - *No. of people they follow
      # - *Profile
      # - *description
      # - *links
      # - *Profile image
      login

      graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "?username=")

      if graphql_script.has_key?("author") && !graphql_script["author"].nil?
        user = graphql_script["author"]

        # Get the username (to verify we're on the right page here)
        scraped_username = user["identifier"]["value"]
        raise Zorki::Error unless username == scraped_username

        number_of_posts = graphql_script["interactionStatistic"].select do |stat|
          stat["interactionType"] == "https://schema.org/FilmAction"
        end.first

        number_of_followers = graphql_script["interactionStatistic"].select do |stat|
          stat["interactionType"] == "http://schema.org/FollowAction"
        end.first

        profile_image_url = user["image"]
        {
          name: user["name"],
          username: username,
          number_of_posts: Integer(number_of_posts["userInteractionCount"]),
          number_of_followers: Integer(number_of_followers["userInteractionCount"]),
          # number_of_following: user["edge_follow"]["count"],
          verified: user["is_verified"], # todo
          profile: graphql_script["description"],
          profile_link: user["sameAs"],
          profile_image: Zorki.retrieve_media(profile_image_url),
          profile_image_url: profile_image_url
        }
      else
        user = graphql_script["data"]["user"]

        # Get the username (to verify we're on the right page here)
        scraped_username = user["username"]
        raise Zorki::Error unless username == scraped_username

        profile_image_url = user["profile_pic_url_hd"]
        {
          name: user["full_name"],
          username: username,
          number_of_posts: user["edge_owner_to_timeline_media"]["count"],
          number_of_followers: user["edge_followed_by"]["count"],
          number_of_following: user["edge_follow"]["count"],
          verified: user["is_verified"],
          profile: user["biography"],
          profile_link: user["external_url"],
          profile_image: Zorki.retrieve_media(profile_image_url),
          profile_image_url: profile_image_url
        }
      end
    end
  end
end
```
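The schema.org branch pulls its counts out of the `interactionStatistic` array. A toy illustration of that `select`, with a made-up stats array matching the shapes the code compares against (the mixed `https`/`http` schemes mirror the code above):

```ruby
stats = [
  { "interactionType" => "https://schema.org/FilmAction",  "userInteractionCount" => "204" },
  { "interactionType" => "http://schema.org/FollowAction", "userInteractionCount" => "1234" }
]

followers = stats.select { |stat| stat["interactionType"] == "http://schema.org/FollowAction" }.first
Integer(followers["userInteractionCount"]) # => 1234
```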
data/lib/zorki/user.rb
ADDED
@@ -0,0 +1,52 @@

```ruby
# frozen_string_literal: true

module Zorki
  class User
    def self.lookup(usernames = [])
      # If a single id is passed in we make it the appropriate array
      usernames = [usernames] unless usernames.kind_of?(Array)

      # Check that the usernames are at least real usernames
      # usernames.each { |id| raise Birdsong::Error if !/\A\d+\z/.match(id) }

      self.scrape(usernames)
    end

    attr_reader :name,
                :username,
                :number_of_posts,
                :number_of_followers,
                :number_of_following,
                :verified,
                :profile,
                :profile_link,
                :profile_image,
                :profile_image_url

    private

    def initialize(user_hash = {})
      @name = user_hash[:name]
      @username = user_hash[:username]
      @number_of_posts = user_hash[:number_of_posts]
      @number_of_followers = user_hash[:number_of_followers]
      @number_of_following = user_hash[:number_of_following]
      @verified = user_hash[:verified]
      @profile = user_hash[:profile]
      @profile_link = user_hash[:profile_link]
      @profile_image = user_hash[:profile_image]
      @profile_image_url = user_hash[:profile_image_url]
    end

    class << self
      private

      def scrape(usernames)
        usernames.map do |username|
          user_hash = Zorki::UserScraper.new.parse(username)
          User.new(user_hash)
        end
      end
    end
  end
end
```
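`lookup` is the public entry point; `scrape` is private, so callers always receive wrapped `Zorki::User` objects. A minimal usage sketch, assuming valid Instagram credentials in the environment ("instagram" is just an example handle):

```ruby
require "zorki"

users = Zorki::User.lookup(["instagram"]) # a bare string would be wrapped into an array
user = users.first

user.username            # => "instagram"
user.number_of_followers # follower count as an Integer
user.profile_image       # path to the avatar downloaded into Zorki.temp_storage_location
```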
data/lib/zorki.rb
ADDED
@@ -0,0 +1,74 @@

```ruby
# frozen_string_literal: true

require_relative "zorki/version"
require_relative "zorki/monkeypatch"

# Representative objects we create
require_relative "zorki/user"
require_relative "zorki/post"

require "helpers/configuration"
require_relative "zorki/scrapers/scraper"

module Zorki
  extend Configuration

  class Error < StandardError
    def initialize(msg = "Zorki encountered an error scraping Instagram")
      super
    end
  end

  class ContentUnavailableError < Error
    def initialize(msg = "Zorki could not find content requested")
      super
    end
  end

  class RetryableError < Error; end

  class ImageRequestTimedOutError < RetryableError
    def initialize(msg = "Zorki encountered a timeout error requesting an image")
      super
    end
  end

  class ImageRequestFailedError < RetryableError
    def initialize(msg = "Zorki received a non-200 response requesting an image")
      super
    end
  end

  define_setting :temp_storage_location, "tmp/zorki"

  # Get an image from a URL and save to a temp folder set in the configuration under
  # temp_storage_location
  def self.retrieve_media(url)
    response = Typhoeus.get(url)

    # Get the file extension if it's in the file
    stripped_url = url.split("?").first # remove URL query params
    extension = stripped_url.split(".").last

    # Do some basic checks so we just empty out if there's something weird in the file extension
    # that could do some harm.
    if extension.length.positive?
      extension = nil unless /^[a-zA-Z0-9]+$/.match?(extension)
      extension = ".#{extension}" unless extension.nil?
    end

    temp_file_name = "#{Zorki.temp_storage_location}/instagram_media_#{SecureRandom.uuid}#{extension}"

    # We do this in case the folder isn't created yet, since it's a temp folder we'll just do so
    self.create_temp_storage_location
    File.binwrite(temp_file_name, response.body)
    temp_file_name
  end

  private

  def self.create_temp_storage_location
    return if File.exist?(Zorki.temp_storage_location) && File.directory?(Zorki.temp_storage_location)
    FileUtils.mkdir_p Zorki.temp_storage_location
  end
end
```
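`define_setting` comes from `helpers/configuration.rb`, which is not shown in this diff; assuming it generates a module-level reader/writer pair (the usual pattern for such helpers), overriding the temp folder and fetching a file would look roughly like this sketch:

```ruby
require "zorki"

# Assumes define_setting exposes a writer; the shipped default is "tmp/zorki".
Zorki.temp_storage_location = "/tmp/zorki_downloads"

# Hypothetical URL; query params are stripped before the extension check, so
# this saves to /tmp/zorki_downloads/instagram_media_<uuid>.jpg and returns that path.
path = Zorki.retrieve_media("https://example.com/some_image.jpg?cache=bust")
```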
data/zorki.gemspec
ADDED
@@ -0,0 +1,43 @@

```ruby
# frozen_string_literal: true

require_relative "lib/zorki/version"

Gem::Specification.new do |spec|
  spec.name = "zorki"
  spec.version = Zorki::VERSION
  spec.authors = ["Christopher Guess"]
  spec.email = ["cguess@gmail.com"]

  spec.summary = "A gem to scrape Instagram pages for archive purposes."
  # spec.description = "TODO: Write a longer description or delete this line."
  # spec.homepage = "TODO: Put your gem's website or public repo URL here."
  spec.license = "MIT"
  spec.required_ruby_version = Gem::Requirement.new(">= 2.7.0")

  # spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"

  # spec.metadata["homepage_uri"] = spec.homepage
  # spec.metadata["source_code_uri"] = "TODO: Put your gem's public repo URL here."
  # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Uncomment to register a new dependency of your gem
  # spec.add_dependency "example-gem", "~> 1.0"
  spec.add_dependency "capybara" # For scraping and running browsers
  spec.add_dependency "apparition" # A Chrome driver for Capybara
  spec.add_dependency "typhoeus" # For making API requests
  spec.add_dependency "oj" # A faster JSON parser/loader than stdlib
  spec.add_dependency "selenium-webdriver" # Drives the browser via Selenium
  spec.add_dependency "selenium-devtools" # Allows us to intercept requests

  # For more information and examples about making a new gem, check out our
  # guide at: https://bundler.io/guides/creating_gem.html
end
```