zorki 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.github/workflows/main.yml +18 -0
- data/.gitignore +12 -0
- data/.rubocop.yml +67 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +84 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +162 -0
- data/LICENSE.txt +21 -0
- data/README.md +85 -0
- data/Rakefile +16 -0
- data/bin/console +15 -0
- data/bin/setup +8 -0
- data/lib/generators/zorki.rb +3 -0
- data/lib/generators/zorki_generator.rb +6 -0
- data/lib/helpers/configuration.rb +28 -0
- data/lib/zorki/monkeypatch.rb +50 -0
- data/lib/zorki/post.rb +46 -0
- data/lib/zorki/scrapers/post_scraper.rb +125 -0
- data/lib/zorki/scrapers/scraper.rb +227 -0
- data/lib/zorki/scrapers/user_scraper.rb +74 -0
- data/lib/zorki/user.rb +52 -0
- data/lib/zorki/version.rb +5 -0
- data/lib/zorki.rb +74 -0
- data/zorki.gemspec +43 -0
- data/zorki.logs +300 -0
- metadata +152 -0
@@ -0,0 +1,125 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "typhoeus"
|
4
|
+
|
5
|
+
module Zorki
  # Scrapes a single Instagram post page for its media, caption, author,
  # date, and like count.
  class PostScraper < Scraper
    # Scrape one post.
    #
    # @param id [String] the post shortcode (the `p/<id>` segment of the URL)
    # @return [Hash] keys: :images, :video, :video_preview_image,
    #   :screenshot_file, :text, :date, :number_of_likes, :user, :id
    def parse(id)
      # Stuff we need to get from the DOM (implemented is starred):
      # - User *
      # - Text *
      # - Image * / Images * / Video *
      # - Date *
      # - Number of likes *
      # - Hashtags
      Capybara.app_host = "https://instagram.com"

      # video slideshows https://www.instagram.com/p/CY7KxwYOFBS/?utm_source=ig_embed&utm_campaign=loading
      login
      graphql_object = get_content_of_subpage_from_url(
        "https://www.instagram.com/p/#{id}/",
        "/graphql",
        "data,xdt_api__v1__media__shortcode__web_info,items"
      )

      # Posts flagged as misinformation come back as a deeply nested API
      # response; clean posts come back as a flat schema.org document.
      # The presence of "articleBody" tells us which shape we received.
      attributes =
        if graphql_object.has_key?("articleBody")
          parse_schema_org_object(graphql_object)
        else
          parse_api_response(graphql_object["data"]["xdt_api__v1__media__shortcode__web_info"])
        end

      screenshot_file = take_screenshot

      # This has to run last since it switches pages
      user = User.lookup([attributes[:username]]).first
      page.quit

      {
        images: attributes[:images],
        video: attributes[:video],
        video_preview_image: attributes[:video_preview_image],
        screenshot_file: screenshot_file,
        text: attributes[:text],
        date: attributes[:date],
        number_of_likes: attributes[:number_of_likes],
        user: user,
        id: id
      }
    end

    # Dismiss a fact-check overlay if one is present, then screenshot the page.
    #
    # @return [String] path of the saved screenshot file
    def take_screenshot
      # Searching for the overlay can take *awhile* because of Instagram's
      # obfuscated DOM hierarchy, so we just try and move on if it's absent.
      begin
        find_button("See Post").click
        sleep(0.1)
      rescue Capybara::ElementNotFound
        # No overlay to clear — nothing to do
      end

      # Take the screenshot and return it
      save_screenshot("#{Zorki.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
    end

    private

    # Extract post attributes from a schema.org-shaped document (clean posts).
    #
    # @param graphql_object [Hash] the schema.org JSON-LD document
    # @return [Hash] partial post attributes (no :user, :screenshot_file, :id)
    def parse_schema_org_object(graphql_object)
      images = graphql_object["image"].map do |image|
        Zorki.retrieve_media(image["url"])
      end

      # Example timestamp: 2021-04-01T17:07:10-07:00
      date = DateTime.strptime(graphql_object["dateCreated"], "%Y-%m-%dT%H:%M:%S%z")

      like_stat = graphql_object["interactionStatistic"].find do |stat|
        stat["interactionType"] == "http://schema.org/LikeAction"
      end

      video = nil
      video_preview_image = nil
      unless graphql_object["video"].empty?
        video = graphql_object["video"].first["contentUrl"]
        video_preview_image = graphql_object["video"].first["thumbnailUrl"]
      end

      {
        images: images,
        video: video,
        video_preview_image: video_preview_image,
        text: graphql_object["articleBody"],
        username: graphql_object["author"]["identifier"]["value"],
        date: date,
        number_of_likes: like_stat["userInteractionCount"]
      }
    end

    # Extract post attributes from the nested API response shape
    # (posts flagged as misinformation).
    #
    # @param web_info [Hash] the "xdt_api__v1__media__shortcode__web_info" subtree
    # @return [Hash] partial post attributes (no :user, :screenshot_file, :id)
    def parse_api_response(web_info)
      item = web_info["items"][0]

      images = nil
      video = nil
      video_preview_image = nil

      if item["video_versions"].nil?
        if item["carousel_media"].nil?
          # Single image post
          images = [Zorki.retrieve_media(item["image_versions2"]["candidates"][0]["url"])]
        else
          # Slideshow post
          images = item["carousel_media"].map do |media|
            Zorki.retrieve_media(media["image_versions2"]["candidates"][0]["url"])
          end
        end
      else
        # Video post — NOTE(review): :images stays nil here, matching the
        # original behavior; callers must handle a nil :images value.
        video = Zorki.retrieve_media(item["video_versions"][0]["url"])
        video_preview_image = Zorki.retrieve_media(item["image_versions2"]["candidates"][0]["url"])
      end

      caption = item["caption"]
      {
        images: images,
        video: video,
        video_preview_image: video_preview_image,
        text: caption.nil? ? "" : caption["text"],
        username: item["user"]["username"],
        date: DateTime.strptime(item["taken_at"].to_s, "%s"),
        number_of_likes: item["like_count"]
      }
    end
  end
end
|
@@ -0,0 +1,227 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "capybara/dsl"
|
4
|
+
require "dotenv/load"
|
5
|
+
require "oj"
|
6
|
+
require "selenium-webdriver"
|
7
|
+
require "logger"
|
8
|
+
require "debug"
|
9
|
+
require "securerandom"
|
10
|
+
|
11
|
+
# 2022-06-07 14:15:23 WARN Selenium [DEPRECATION] [:browser_options] :options as a parameter for driver initialization is deprecated. Use :capabilities with an Array of value capabilities/options if necessary instead.
|
12
|
+
|
13
|
+
# Chrome options for the default zorki driver. The exclude_switches tweak and
# the custom user agent reduce the chance Instagram detects automation.
options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
options.add_argument("--start-maximized")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
# BUGFIX: this flag previously began with an en dash ("–-"), so Chrome
# silently ignored it and the automation fingerprint was left enabled.
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--disable-extensions")
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
options.add_preference "password_manager_enabled", false
# Fresh profile directory per process so sessions don't leak between runs
options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")

Capybara.register_driver :selenium_zorki do |app|
  client = Selenium::WebDriver::Remote::Http::Default.new
  client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
  Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
end

Capybara.threadsafe = true
Capybara.default_max_wait_time = 60
Capybara.reuse_server = true
|
32
|
+
|
33
|
+
module Zorki
  # Base class for the Instagram scrapers. Owns the Selenium/Capybara session,
  # request interception, login, and small parsing helpers.
  class Scraper # rubocop:disable Metrics/ClassLength
    include Capybara::DSL

    @@logger = Logger.new(STDOUT)
    @@logger.level = Logger::WARN
    @@logger.datetime_format = "%Y-%m-%d %H:%M:%S"
    @@session_id = nil

    def initialize
      Capybara.default_driver = :selenium_zorki
    end

    # Instagram uses GraphQL (like most of Facebook I think), and returns an object that actually
    # is used to seed the page. We can just parse this for most things.
    #
    # additional_search_parameters is a comma separated list of keys to walk into the
    # intercepted JSON, example: `data,xdt_api__v1__media__shortcode__web_info,items`
    #
    # @return [Hash] a ruby hash of the JSON data
    def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
      # For pages marked as misinformation we have to intercept a subsequent
      # XHR request; for clean pages the data is embedded in the page itself
      # as a schema.org JSON-LD script tag. We try interception first and fall
      # back to scraping the script tag.
      response_body = nil

      page.driver.browser.intercept do |request, &continue|
        # This passes the request forward unmodified, since we only care about the response
        continue.call(request) && next unless request.url.include?(subpage_search)

        continue.call(request) do |response|
          # Check if not a CORS prefetch and finish up if not
          if response.body.present?
            check_passed = true

            unless additional_search_parameters.nil?
              body_to_check = Oj.load(response.body)

              # Walk the comma-separated key path; the response only counts if
              # every key along the way exists.
              search_parameters = additional_search_parameters.split(",")
              search_parameters.each_with_index do |key, index|
                break if body_to_check.nil?

                check_passed = false unless body_to_check.has_key?(key)
                body_to_check = body_to_check[key]
              end
            end

            response_body = response.body if check_passed == true
          end
        end
      rescue Selenium::WebDriver::Error::WebDriverError
        # Eat them
      end

      # Now that the intercept is set up, we visit the page we want
      visit(url)
      # We wait until the correct intercept is processed or we've waited 60 seconds
      start_time = Time.now

      sleep(rand(1...10))
      while response_body.nil? && (Time.now - start_time) < 60
        sleep(0.1)
      end

      page.driver.execute_script("window.stop();")

      # If this is a page that has not been marked as misinfo we can just pull the data
      # TODO: put this before the whole load loop
      if response_body.nil?
        doc = Nokogiri::HTML(page.driver.browser.page_source)
        elements = doc.search("script").find_all do |e|
          e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
        end

        raise ContentUnavailableError if elements&.empty?
        return Oj.load(elements.first.text)
      end

      raise ContentUnavailableError if response_body.nil?
      Oj.load(response_body)
    end

    private

    ##########
    # Set the session to use a new user folder in the options!
    # #####################
    def reset_selenium
      options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
      options.add_argument("--start-maximized")
      options.add_argument("--no-sandbox")
      options.add_argument("--disable-dev-shm-usage")
      # BUGFIX: this flag previously began with an en dash ("–-"), so Chrome
      # silently ignored it and the automation fingerprint was left enabled.
      options.add_argument("--disable-blink-features=AutomationControlled")
      options.add_argument("--disable-extensions")
      options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
      options.add_preference "password_manager_enabled", false
      options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")

      Capybara.register_driver :selenium do |app|
        client = Selenium::WebDriver::Remote::Http::Default.new
        client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
        Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
      end

      Capybara.current_driver = :selenium
    end

    # Log into Instagram with the INSTAGRAM_USER_NAME / INSTAGRAM_PASSWORD
    # credentials, retrying a handful of times before giving up.
    def login
      # Reset the sessions so that there's nothing laying around
      page.quit

      # Check if we're on a Instagram page already, if not visit it.
      visit("https://instagram.com") unless page.driver.browser.current_url.include? "instagram.com"

      # We don't have to login if we already are
      begin
        return if find_field("Search", wait: 10).present?
      rescue Capybara::ElementNotFound; end

      # Check if we're redirected to a login page, if we aren't we're already logged in
      return unless page.has_xpath?('//*[@id="loginForm"]/div/div[3]/button')

      # Try to log in
      loop_count = 0
      while loop_count < 5
        fill_in("username", with: ENV["INSTAGRAM_USER_NAME"])
        fill_in("password", with: ENV["INSTAGRAM_PASSWORD"])

        begin
          click_button("Log in", exact_text: true) # Note: "Log in" (lowercase `in`) instead redirects to Facebook's login page
        rescue Capybara::ElementNotFound; end # If we can't find it don't break horribly, just keep waiting

        # BUGFIX: the selector was missing its closing "]", so the login error
        # message could never match and the retry loop always exited immediately
        break unless has_css?('p[data-testid="login-error-message"]', wait: 10)
        loop_count += 1
        sleep(rand * 10.3)
      end

      # Sometimes Instagram just... doesn't let you log in
      raise "Instagram not accessible" if loop_count == 5

      # No we don't want to save our login credentials
      begin
        click_on("Save Info")
      rescue Capybara::ElementNotFound; end
    end

    # Synchronously download an image and return its raw bytes.
    #
    # @param url [String] the image URL
    # @return [String] the response body
    # @raise [Zorki::Error] on timeout or non-successful HTTP response
    def fetch_image(url)
      request = Typhoeus::Request.new(url, followlocation: true)
      request.on_complete do |response|
        # BUGFIX: these checks previously called request.success? / request.body /
        # request.code, which are Response methods, and would have raised
        # NoMethodError; likewise `Zorki::Error("…")` was a method call, not a
        # constructor.
        if response.success?
          return response.body
        elsif response.timed_out?
          raise Zorki::Error.new("Fetching image at #{url} timed out")
        else
          raise Zorki::Error.new("Fetching image at #{url} returned non-successful HTTP server response #{response.code}")
        end
      end
      # BUGFIX: the request was constructed but never executed, so the
      # on_complete callback (and the return value) never fired.
      request.run
    end

    # Convert a count string such as "1,500", "2.5k" or "1.2m" to an Integer.
    def number_string_to_integer(number_string)
      # First we have to remove any commas in the number or else it all breaks
      number_string = number_string.delete(",")
      # Is the last digit not a number? If so, we're going to have to multiply it by some multiplier
      should_expand = /[0-9]/.match(number_string[-1, 1]).nil?

      # Get the last index and remove the letter at the end if we should expand
      last_index = should_expand ? number_string.length - 1 : number_string.length
      number = number_string[0, last_index].to_f
      multiplier = 1
      # Determine the multiplier depending on the letter indicated
      case number_string[-1, 1]&.downcase
      when "k"
        # BUGFIX: "k" suffixes were previously stripped but the multiplier
        # stayed 1, so "10k" parsed as 10
        multiplier = 1_000
      when "m"
        multiplier = 1_000_000
      when "b"
        multiplier = 1_000_000_000
      end

      # Multiply everything and insure we get an integer back
      (number * multiplier).to_i
    end
  end
end
|
225
|
+
|
226
|
+
require_relative "post_scraper"
|
227
|
+
require_relative "user_scraper"
|
@@ -0,0 +1,74 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "typhoeus"
|
4
|
+
|
5
|
+
module Zorki
  # Scrapes an Instagram profile page for account metadata and the
  # profile image.
  class UserScraper < Scraper
    # Scrape a single account by username.
    #
    # Fields gathered (implemented is starred):
    # - *Name, *Username, *No. of posts, *Verified, *No. of followers,
    #   *No. of people they follow, *Profile description/links, *Profile image
    #
    # @param username [String] the Instagram handle (no leading "@")
    # @return [Hash] account attributes: :name, :username, :number_of_posts,
    #   :number_of_followers, :verified, :profile, :profile_link,
    #   :profile_image, :profile_image_url (plus :number_of_following for the
    #   non-schema.org payload shape)
    def parse(username)
      login

      graphql_script = get_content_of_subpage_from_url("https://instagram.com/#{username}/", "?username=")

      # The payload arrives in one of two shapes: a schema.org-style document
      # (carrying an "author" key) or the older nested "data"/"user" structure.
      author = graphql_script.has_key?("author") ? graphql_script["author"] : nil

      if author.nil?
        account = graphql_script["data"]["user"]

        # Verify we actually landed on the page we asked for
        raise Zorki::Error unless username == account["username"]

        profile_image_url = account["profile_pic_url_hd"]
        {
          name: account["full_name"],
          username: username,
          number_of_posts: account["edge_owner_to_timeline_media"]["count"],
          number_of_followers: account["edge_followed_by"]["count"],
          number_of_following: account["edge_follow"]["count"],
          verified: account["is_verified"],
          profile: account["biography"],
          profile_link: account["external_url"],
          profile_image: Zorki.retrieve_media(profile_image_url),
          profile_image_url: profile_image_url
        }
      else
        # Verify we actually landed on the page we asked for
        raise Zorki::Error unless username == author["identifier"]["value"]

        stats = graphql_script["interactionStatistic"]
        post_stat = stats.find { |stat| stat["interactionType"] == "https://schema.org/FilmAction" }
        follower_stat = stats.find { |stat| stat["interactionType"] == "http://schema.org/FollowAction" }

        profile_image_url = author["image"]
        {
          name: author["name"],
          username: username,
          number_of_posts: Integer(post_stat["userInteractionCount"]),
          number_of_followers: Integer(follower_stat["userInteractionCount"]),
          # number_of_following: user["edge_follow"]["count"],
          verified: author["is_verified"], # todo
          profile: graphql_script["description"],
          profile_link: author["sameAs"],
          profile_image: Zorki.retrieve_media(profile_image_url),
          profile_image_url: profile_image_url
        }
      end
    end
  end
end
|
data/lib/zorki/user.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Zorki
  # Value object holding the attributes of a scraped Instagram account.
  class User
    # Scrape one or more accounts by username.
    #
    # @param usernames [String, Array<String>] a handle or list of handles
    # @return [Array<User>] one User per requested username
    def self.lookup(usernames = [])
      # A single username may be passed directly; wrap it so we always map
      # over an array below
      usernames = [usernames] unless usernames.is_a?(Array)

      # Check that the usernames are at least real usernames
      # usernames.each { |id| raise Birdsong::Error if !/\A\d+\z/.match(id) }

      scrape(usernames)
    end

    attr_reader :name, :username, :number_of_posts, :number_of_followers,
                :number_of_following, :verified, :profile, :profile_link,
                :profile_image, :profile_image_url

    private

    # Populate the readers from a hash shaped like UserScraper#parse's result.
    # Missing keys simply leave the corresponding attribute nil.
    def initialize(user_hash = {})
      @name = user_hash[:name]
      @username = user_hash[:username]
      @number_of_posts = user_hash[:number_of_posts]
      @number_of_followers = user_hash[:number_of_followers]
      @number_of_following = user_hash[:number_of_following]
      @verified = user_hash[:verified]
      @profile = user_hash[:profile]
      @profile_link = user_hash[:profile_link]
      @profile_image = user_hash[:profile_image]
      @profile_image_url = user_hash[:profile_image_url]
    end

    class << self
      private

      # Run the scraper once per username and wrap each result in a User.
      def scrape(usernames)
        usernames.map do |handle|
          User.new(Zorki::UserScraper.new.parse(handle))
        end
      end
    end
  end
end
|
data/lib/zorki.rb
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "zorki/version"
|
4
|
+
require_relative "zorki/monkeypatch"
|
5
|
+
|
6
|
+
# Representative objects we create
|
7
|
+
require_relative "zorki/user"
|
8
|
+
require_relative "zorki/post"
|
9
|
+
|
10
|
+
require "helpers/configuration"
|
11
|
+
require_relative "zorki/scrapers/scraper"
|
12
|
+
|
13
|
+
module Zorki
  extend Configuration

  # Base error for everything zorki raises.
  class Error < StandardError
    def initialize(msg = "Zorki encountered an error scraping Instagram")
      super
    end
  end

  class ContentUnavailableError < Error
    def initialize(msg = "Zorki could not find content requested")
      super
    end
  end

  # Errors that callers (e.g. Hypatia) may safely retry.
  class RetryableError < Error; end

  class ImageRequestTimedOutError < RetryableError
    def initialize(msg = "Zorki encountered a timeout error requesting an image")
      super
    end
  end

  class ImageRequestFailedError < RetryableError
    def initialize(msg = "Zorki received a non-200 response requesting an image")
      super
    end
  end

  define_setting :temp_storage_location, "tmp/zorki"

  # Get an image from a URL and save to a temp folder set in the configuration under
  # temp_storage_location
  #
  # @param url [String] the media URL to download
  # @return [String] path of the downloaded file
  # @raise [ImageRequestTimedOutError] if the download times out (retryable)
  # @raise [ImageRequestFailedError] if the server responds unsuccessfully (retryable)
  def self.retrieve_media(url)
    response = Typhoeus.get(url)

    # BUGFIX: failed or timed-out downloads were previously written to disk as
    # if they were valid media; raise the (previously unused) retryable errors
    # instead so callers can retry.
    raise ImageRequestTimedOutError if response.timed_out?
    raise ImageRequestFailedError unless response.success?

    # Get the file extension if it's in the file
    stripped_url = url.split("?").first # remove URL query params
    extension = stripped_url.split(".").last

    # Do some basic checks so we just empty out if there's something weird in the file extension
    # that could do some harm.
    if extension.length.positive?
      extension = nil unless /^[a-zA-Z0-9]+$/.match?(extension)
      extension = ".#{extension}" unless extension.nil?
    end

    temp_file_name = "#{Zorki.temp_storage_location}/instagram_media_#{SecureRandom.uuid}#{extension}"

    # We do this in case the folder isn't created yet, since it's a temp folder we'll just do so
    create_temp_storage_location
    File.binwrite(temp_file_name, response.body)
    temp_file_name
  end

  # Internal helper: make sure the temp storage folder exists.
  # NOTE: a bare module-level `private` (as in the original) has no effect on
  # `def self.` singleton methods, so it was a no-op and has been removed;
  # this method remains public but is internal by convention.
  def self.create_temp_storage_location
    return if File.exist?(Zorki.temp_storage_location) && File.directory?(Zorki.temp_storage_location)

    FileUtils.mkdir_p Zorki.temp_storage_location
  end
end
|
data/zorki.gemspec
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "lib/zorki/version"
|
4
|
+
|
5
|
+
Gem::Specification.new do |spec|
  # Gem identity
  spec.name = "zorki"
  spec.version = Zorki::VERSION
  spec.authors = ["Christopher Guess"]
  spec.email = ["cguess@gmail.com"]

  spec.summary = "A gem to scrape Instagram pages for archive purposes."
  # spec.description = "TODO: Write a longer description or delete this line."
  # spec.homepage = "TODO: Put your gem's website or public repo URL here."
  spec.license = "MIT"
  spec.required_ruby_version = Gem::Requirement.new(">= 2.7.0")

  # spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"

  # spec.metadata["homepage_uri"] = spec.homepage
  # spec.metadata["source_code_uri"] = "TODO: Put your gem's public repo URL here."
  # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Runtime dependencies
  # Uncomment to register a new dependency of your gem
  # spec.add_dependency "example-gem", "~> 1.0"
  spec.add_dependency "capybara" # For scraping and running browsers
  spec.add_dependency "apparition" # A Chrome driver for Capybara
  spec.add_dependency "typhoeus" # For making API requests
  spec.add_dependency "oj" # A faster JSON parser/loader than stdlib
  spec.add_dependency "selenium-webdriver" # Webdriver selenium
  spec.add_dependency "selenium-devtools" # Allow us to intercept requests

  # For more information and examples about making a new gem, checkout our
  # guide at: https://bundler.io/guides/creating_gem.html
end
|