mosquito-scrape 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/main.yml +22 -0
- data/.gitignore +10 -0
- data/.rubocop.yml +67 -0
- data/CHANGELOG.md +9 -0
- data/CODE_OF_CONDUCT.md +84 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +146 -0
- data/LICENSE.txt +21 -0
- data/README.md +43 -0
- data/Rakefile +16 -0
- data/bin/console +15 -0
- data/bin/setup +8 -0
- data/lib/generators/mosquito.rb +3 -0
- data/lib/generators/mosquito_generator.rb +6 -0
- data/lib/helpers/configuration.rb +28 -0
- data/lib/mosquito/monkeypatch.rb +52 -0
- data/lib/mosquito/scrapers/scraper.rb +227 -0
- data/lib/mosquito/scrapers/tweet_scraper.rb +95 -0
- data/lib/mosquito/scrapers/user_scraper.rb +79 -0
- data/lib/mosquito/tweet.rb +52 -0
- data/lib/mosquito/user.rb +57 -0
- data/lib/mosquito/version.rb +5 -0
- data/lib/mosquito.rb +82 -0
- data/mosquito.gemspec +58 -0
- metadata +347 -0
@@ -0,0 +1,227 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "capybara/dsl"
|
4
|
+
require "dotenv/load"
|
5
|
+
require "oj"
|
6
|
+
require "selenium-webdriver"
|
7
|
+
require "logger"
|
8
|
+
require "securerandom"
|
9
|
+
require "selenium/webdriver/remote/http/curb"
|
10
|
+
require "debug"
|
11
|
+
require "typhoeus"
|
12
|
+
|
13
|
+
# 2022-06-07 14:15:23 WARN Selenium [DEPRECATION] [:browser_options] :options as a parameter for driver initialization is deprecated. Use :capabilities with an Array of value capabilities/options if necessary instead.
|
14
|
+
|
15
|
+
# Build the Chrome options used by the default Capybara driver. These flags are
# tuned to make the automated browser look like a normal user session.
options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
options.add_argument("--start-maximized")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
# BUGFIX: this flag previously began with a Unicode en-dash ("–-disable-blink-features"),
# which Chrome silently ignores; it must be two ASCII hyphens to take effect.
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--disable-extensions")
options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
options.add_preference "password_manager_enabled", false
# A unique user-data dir per run so parallel scrapes don't share profile state.
options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
|
25
|
+
|
26
|
+
# Register a gem-specific Capybara driver so this code does not clobber the
# host application's default :selenium driver configuration.
Capybara.register_driver :selenium_mosquito do |app|
  # Curb-backed HTTP client for the WebDriver wire protocol.
  client = Selenium::WebDriver::Remote::Http::Curb.new
  # client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
  Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
end
|
31
|
+
|
32
|
+
# Global Capybara tuning: allow use from multiple threads, wait up to 60s for
# elements/intercepts, and reuse the server between sessions.
Capybara.threadsafe = true
Capybara.default_max_wait_time = 60
Capybara.reuse_server = true
|
35
|
+
|
36
|
+
module Mosquito
|
37
|
+
class Scraper # rubocop:disable Metrics/ClassLength
|
38
|
+
include Capybara::DSL
|
39
|
+
|
40
|
+
@@logger = Logger.new(STDOUT)
|
41
|
+
@@logger.level = Logger::WARN
|
42
|
+
@@logger.datetime_format = "%Y-%m-%d %H:%M:%S"
|
43
|
+
@@session_id = nil
|
44
|
+
|
45
|
+
# Select this gem's registered Selenium driver for every new scraper instance.
def initialize
  Capybara.default_driver = :selenium_mosquito
end
|
48
|
+
|
49
|
+
# Fetch a URL with a plain HTTP GET (no browser), following redirects.
#
# @param url [String] the URL to fetch
# @return [Typhoeus::Response] the raw HTTP response object
def get_content_of_page_from_url_curl(url)
  Typhoeus.get(url, followlocation: true)
end
|
52
|
+
|
53
|
+
# Instagram uses GraphQL (like most of Facebook I think), and returns an object that actually
# is used to seed the page. We can just parse this for most things.
#
# additional_search_parameters is a comma-separated list of keys
# example: `data,xdt_api__v1__media__shortcode__web_info,items`
#
# @return [Hash] a ruby hash of the JSON data
def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
  # Our data no longer lives in the graphql object passed initially with the page.
  # Instead it comes in as part of a subsequent call. This intercepts all calls, checks if it's
  # the one we want, and then moves on.
  response_body = nil

  page.driver.browser.intercept do |request, &continue|
    # Pass requests forward unmodified unless the URL matches the subpage we
    # care about — we only inspect the response.
    continue.call(request) && next unless request.url.include?(subpage_search)

    continue.call(request) do |response|
      # Skip empty bodies (e.g. CORS preflight responses).
      # BUGFIX: check for a nil body *before* calling #empty? on it — the original
      # order (`!response.body.empty? && response.body`) raised NoMethodError when
      # the body was nil.
      if response.body && !response.body.empty?
        check_passed = true
        unless additional_search_parameters.nil?
          # Walk the parsed JSON along the comma-separated key path;
          # every key must be present for the response to qualify.
          body_to_check = Oj.load(response.body)

          additional_search_parameters.split(",").each do |key|
            break if body_to_check.nil?

            check_passed = false unless body_to_check.has_key?(key)
            body_to_check = body_to_check[key]
          end
        end

        response_body = response.body if check_passed == true
      end
    end
  rescue Selenium::WebDriver::Error::WebDriverError
    # Eat them
  end

  # Now that the intercept is set up, we visit the page we want
  page.driver.browser.navigate.to(url)
  # We wait until the correct intercept is processed or we've waited 60 seconds
  start_time = Time.now

  sleep(rand(1...10))
  sleep(0.1) while response_body.nil? && (Time.now - start_time) < 60

  page.driver.execute_script("window.stop();")
  raise Mosquito::NoTweetFoundError if response_body.nil?
  Oj.load(response_body)
end
|
118
|
+
|
119
|
+
private

##########
# Set the session to use a new user folder in the options!
# #####################
def reset_selenium
  options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
  options.add_argument("--start-maximized")
  options.add_argument("--no-sandbox")
  options.add_argument("--disable-dev-shm-usage")
  # BUGFIX: was "–-disable-blink-features" with a Unicode en-dash, which Chrome ignores.
  options.add_argument("--disable-blink-features=AutomationControlled")
  options.add_argument("--disable-extensions")
  options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")

  options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
  options.add_preference "password_manager_enabled", false
  # Fresh profile directory so no session state leaks between resets.
  options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
  # options.add_argument("--user-data-dir=/tmp/tarun")

  Capybara.register_driver :selenium do |app|
    client = Selenium::WebDriver::Remote::Http::Curb.new
    # client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
    Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
  end

  Capybara.current_driver = :selenium
end
|
146
|
+
|
147
|
+
# Log in to Instagram with INSTAGRAM_USER_NAME / INSTAGRAM_PASSWORD from the
# environment, retrying up to five times before giving up.
#
# @raise [RuntimeError] "Instagram not accessible" when all attempts fail
def login
  # Reset the sessions so that there's nothing laying around
  page.quit

  # Check if we're on a Instagram page already, if not visit it.
  unless page.driver.browser.current_url.include? "instagram.com"
    # There seems to be a bug in the Linux ARM64 version of chromedriver where this will properly
    # navigate but then timeout, crashing it all up. So instead we check and raise the error when
    # that then fails again.
    page.driver.browser.navigate.to("https://instagram.com")
  end

  # We don't have to login if we already are
  begin
    return if find_field("Search", wait: 10).present?
  rescue Capybara::ElementNotFound; end

  # Check if we're redirected to a login page, if we aren't we're already logged in
  return unless page.has_xpath?('//*[@id="loginForm"]/div/div[3]/button')

  # Try to log in
  loop_count = 0
  while loop_count < 5
    fill_in("username", with: ENV["INSTAGRAM_USER_NAME"])
    fill_in("password", with: ENV["INSTAGRAM_PASSWORD"])

    begin
      click_button("Log in", exact_text: true) # Note: "Log in" (lowercase `in`) instead redirects to Facebook's login page
    rescue Capybara::ElementNotFound; end # If we can't find it don't break horribly, just keep waiting

    # BUGFIX: the CSS selector was missing its closing "]"
    # ('p[data-testid="login-error-message"'), which raises an invalid-selector
    # error instead of detecting the login failure message.
    break unless has_css?('p[data-testid="login-error-message"]', wait: 10)
    loop_count += 1
    sleep(rand * 10.3)
  end

  # Sometimes Instagram just... doesn't let you log in
  raise "Instagram not accessible" if loop_count == 5

  # No we don't want to save our login credentials
  begin
    click_on("Save Info")
  rescue Capybara::ElementNotFound; end
end
|
190
|
+
|
191
|
+
# Download an image and return its raw bytes.
#
# @param url [String]
# @return [String] the response body
# @raise [Mosquito::Error] on timeout or a non-successful HTTP status
def fetch_image(url)
  request = Typhoeus::Request.new(url, followlocation: true)
  request.on_complete do |response|
    if response.success?
      # The `return` here exits fetch_image with the downloaded bytes.
      return response.body
    elsif response.timed_out?
      # BUGFIX: was `raise Zorki::Error(...)` — a leftover from the zorki gem this
      # code was adapted from; Zorki is undefined here, and `Error(...)` is a
      # method call rather than an exception construction.
      raise Mosquito::Error, "Fetching image at #{url} timed out"
    else
      raise Mosquito::Error, "Fetching image at #{url} returned non-successful HTTP server response #{response.code}"
    end
  end

  # BUGFIX: the request was built but never executed, so the method previously
  # returned the Typhoeus::Request object itself instead of the image bytes.
  request.run
end
|
203
|
+
|
204
|
+
# Convert a display-count string such as "1,234", "5.2k", or "1.5m" to an Integer.
#
# @param number_string [String] the string to convert
# @return [Integer] the expanded integer value (0 for an empty string)
def number_string_to_integer(number_string)
  # First we have to remove any commas in the number or else it all breaks
  number_string = number_string.delete(",")
  # Robustness: an empty string would otherwise blow up in the slicing below.
  return 0 if number_string.empty?

  # Is the last digit not a number? If so, we're going to have to multiply it by some multiplier
  should_expand = /[0-9]/.match(number_string[-1, 1]).nil?

  # Get the last index and remove the letter at the end if we should expand
  last_index = should_expand ? number_string.length - 1 : number_string.length
  number = number_string[0, last_index].to_f

  # Determine the multiplier depending on the suffix letter.
  # Generalized beyond the original "m"-only handling to cover the other
  # abbreviations sites render ("k" and "b"), case-insensitively.
  multiplier =
    case number_string[-1, 1].downcase
    when "k" then 1_000
    when "m" then 1_000_000
    when "b" then 1_000_000_000
    else 1
    end

  # Multiply everything and ensure we get an integer back
  (number * multiplier).to_i
end
|
224
|
+
end
|
225
|
+
end
|
226
|
+
|
227
|
+
# require_relative "tweet_scraper"
|
@@ -0,0 +1,95 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "typhoeus"
|
4
|
+
require_relative "scraper"
|
5
|
+
require "nokogiri"
|
6
|
+
require "open-uri"
|
7
|
+
|
8
|
+
module Mosquito
|
9
|
+
class TweetScraper < Scraper
|
10
|
+
# Scrape a single tweet from the configured Nitter instance.
#
# @param id [String] the tweet's numeric id
# @return [Hash] parsed tweet attributes (text, date, media, user, etc.)
# @raise [Mosquito::NoTweetFoundError] if Nitter cannot find the tweet
def parse(id)
  # Stuff we need to get from the DOM (implemented is starred):
  # - User
  # - Text *
  # - Image / Images / Video *
  # - Date *
  # - Number of likes *
  # - Hashtags

  Capybara.app_host = ENV["NITTER_URL"]

  # Nitter redirects /<user>/status/<id> to the canonical page, so the
  # "jack" username segment here is just a placeholder.
  begin
    doc = Nokogiri::HTML(URI.open("#{ENV["NITTER_URL"]}/jack/status/#{id}"))
  rescue OpenURI::HTTPError
    raise Mosquito::NoTweetFoundError
  end

  unless doc.xpath("//div[contains(@class, 'error-panel')]").empty?
    raise Mosquito::NoTweetFoundError
  end

  text = doc.xpath("//div[contains(@class, 'tweet-content media-body')]").first.content
  date = DateTime.parse(doc.xpath("//span[contains(@class, 'tweet-date')]").first.child["title"])
  # Canonical link carries the definitive id (the caller may have passed a variant).
  id = URI.parse(doc.xpath("//link[contains(@rel, 'canonical')]").first["href"]).path.split("/").last
  number_of_likes = doc.xpath("//span[contains(@class, 'tweet-stat')][last()]/div").first.content.delete(",").to_i
  language = "en" # We can't determine this anymore with Nitter, but english will be fine, we don't actually use this anywhere... i think

  images = []
  videos = []
  video_preview_image = nil
  video_file_type = nil

  # Single image
  # BUGFIX: the original xpath ".../still-image[1]/href" selected child *elements*
  # named `href` (which never exist), so single images were silently dropped; we
  # want the @href attribute, host-prefixed like the slideshow branch below.
  nodes = doc.xpath("//a[contains(@class, 'still-image')][1]/@href")
  images.concat(nodes.map { |node| "#{Capybara.app_host}#{node.value}" })

  # Slideshow
  nodes = doc.xpath("//div[contains(@class, 'main-tweet')]/div/div/div[contains(@class, 'attachments')]/div[contains(@class, 'gallery-row')]/div/a/@href")
  images.concat(nodes.map { |node| "#{Capybara.app_host}#{node.value}" })

  # Video
  nodes = doc.xpath("//div[contains(@class, 'main-tweet')]/div/div/div[contains(@class, 'attachments')]/div[contains(@class, 'gallery-video')]/div/video")
  unless nodes.empty?
    video_preview_image = nodes.first["poster"]
    videos.concat(nodes.map { |node| "#{Capybara.app_host}#{node.xpath("//source").first["src"]}" })
    video_file_type = "video" # This is always video now, since a gif isn't displayed differently
  end

  # GIF
  nodes = doc.xpath("//div[contains(@class, 'main-tweet')]/div/div/div[contains(@class, 'attachments')]/div[contains(@class, 'gallery-gif')]/div/video")
  unless nodes.empty?
    video_preview_image = nodes.first["poster"]
    videos.concat(nodes.map { |node| "#{Capybara.app_host}#{node.xpath("//source[1]/source/@src").first&.content}" })
    video_file_type = "gif"
  end

  username = doc.xpath("//a[contains(@class, 'username')][1]/@href").first.value
  user = UserScraper.new.parse(username)

  screenshot_file = take_screenshot()

  {
    images: images,
    video: videos,
    video_preview_image: video_preview_image,
    screenshot_file: screenshot_file,
    text: text,
    date: date,
    number_of_likes: number_of_likes,
    user: user,
    id: id,
    language: language,
    video_file_type: video_file_type
  }
end
|
87
|
+
|
88
|
+
# Capture the current browser page to a uniquely named PNG under the
# configured temp storage location; returns the saved file's path.
def take_screenshot
  # First check if a post has a fact check overlay, if so, clear it.
  # The only issue is that this can take *awhile* to search. Not sure what to do about that
  # since it's Instagram's fault for having such a fucked up obfuscated hierarchy # Take the screenshot and return it
  save_screenshot("#{Mosquito.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
end
|
94
|
+
end
|
95
|
+
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "typhoeus"
|
4
|
+
require_relative "scraper"
|
5
|
+
require "nokogiri"
|
6
|
+
require "open-uri"
|
7
|
+
|
8
|
+
module Mosquito
|
9
|
+
class UserScraper < Scraper
|
10
|
+
# Scrape a user's profile page from the configured Nitter instance.
#
# @param username [String] the handle, with or without a leading "/"
# @return [Hash] the parsed profile attributes
# @raise [Mosquito::NoTweetFoundError] if Nitter shows an error panel
def parse(username)
  # Stuff we need to get from the DOM (implemented is starred):
  # id, name, username, sign_up_date, location, profile_image_url, description,
  # followers_count, following_count, tweet_count, listed_count, verified, url,
  # profile_image_file_name

  Capybara.app_host = ENV["NITTER_URL"]

  # Links scraped from tweets arrive as "/handle"; normalize to a bare handle.
  username = username.delete("/")

  doc = Nokogiri::HTML(URI.open("#{ENV["NITTER_URL"]}/#{username}"))

  unless doc.xpath("//div[contains(@class, 'error-panel')]").empty?
    raise Mosquito::NoTweetFoundError
  end

  # Nitter exposes no numeric id, so the handle doubles as the id.
  id = username
  full_name = doc.xpath("//a[contains(@class, 'profile-card-fullname')]/@title").first&.value
  # BUGFIX: guard the join-date lookup — DateTime.parse(nil) raises TypeError
  # when the profile markup omits the joindate span.
  join_date_title = doc.xpath("//div[contains(@class, 'profile-joindate')]/span/@title").first&.value
  sign_up_date = join_date_title.nil? ? nil : DateTime.parse(join_date_title)
  location = doc.xpath("//div[contains(@class, 'profile-location')]/span[last()]").first&.content
  profile_image_url = "#{Capybara.app_host}#{doc.xpath("//a[contains(@class, 'profile-card-avatar')]/@href").first&.value}"
  description = doc.xpath("//div[contains(@class, 'profile-bio')]/p").first&.content
  # nil&.delete returns nil, and nil.to_i is 0, so missing counts default to 0.
  followers_count = doc.xpath("//li[contains(@class, 'followers')]/span[contains(@class, 'profile-stat-num')]").first&.content&.delete(",").to_i
  following_count = doc.xpath("//li[contains(@class, 'following')]/span[contains(@class, 'profile-stat-num')]").first&.content&.delete(",").to_i
  tweet_count = doc.xpath("//li[contains(@class, 'posts')]/span[contains(@class, 'profile-stat-num')]").first&.content&.delete(",").to_i
  listed_count = 0 # We can't get this from nitter, and it's not a big deal
  verified = !doc.xpath("//a[contains(@class, 'profile-card-fullname')]/div/span[contains(@title, 'Verified account')]").empty?
  url = doc.xpath("//div[contains(@class, 'profile-website')]/span[last()]/a/@href").first&.content
  profile_image_file_name = Mosquito.retrieve_media(profile_image_url)

  {
    id: id,
    name: full_name,
    username: username,
    sign_up_date: sign_up_date,
    location: location,
    profile_image_url: profile_image_url,
    description: description,
    followers_count: followers_count,
    following_count: following_count,
    tweet_count: tweet_count,
    listed_count: listed_count,
    verified: verified,
    url: url,
    profile_image_file_name: profile_image_file_name
  }
end
|
71
|
+
|
72
|
+
# Capture the current browser page to a uniquely named PNG under the
# configured temp storage location; returns the saved file's path.
def take_screenshot
  # First check if a post has a fact check overlay, if so, clear it.
  # The only issue is that this can take *awhile* to search. Not sure what to do about that
  # since it's Instagram's fault for having such a fucked up obfuscated hierarchy # Take the screenshot and return it
  save_screenshot("#{Mosquito.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
end
|
78
|
+
end
|
79
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Mosquito
  # A single scraped tweet. Instances are created via `Tweet.lookup`, which
  # drives a TweetScraper and wraps each resulting attribute hash.
  class Tweet
    # Look up one or more tweets by id.
    #
    # @param ids [String, Integer, Array] a single id or an array of ids
    # @return [Array<Tweet>]
    # @raise [Mosquito::InvalidIdError] if any id is not purely numeric
    def self.lookup(ids = [])
      # If a single id is passed in we make it the appropriate array
      ids = [ids] unless ids.is_a?(Array)

      # Check that the ids are at least real ids. `to_s` also accepts Integer
      # ids, which the original string-only match rejected with a TypeError
      # instead of our own error class.
      ids.each { |id| raise Mosquito::InvalidIdError unless /\A\d+\z/.match?(id.to_s) }

      tweet_objects = ids.map { |id| Mosquito::TweetScraper.new.parse(id) }

      tweet_objects.map do |tweet_object|
        Tweet.new(tweet_object)
      end
    end

    # Attributes for after the response is parsed from Twitter
    attr_reader :json
    attr_reader :id
    attr_reader :created_at
    attr_reader :text
    attr_reader :language
    attr_reader :author_id
    attr_reader :author
    attr_reader :image_file_names
    attr_reader :video_file_names
    attr_reader :video_file_type
    attr_reader :video_preview_image

    alias_method :user, :author # Every other gem uses `user` so we can just alias it

    private

    # @param tweet_object [Hash] the attribute hash produced by TweetScraper#parse
    def initialize(tweet_object)
      @id = tweet_object[:id]
      @created_at = tweet_object[:date]
      @text = tweet_object[:text]
      @language = tweet_object[:language]
      @author_id = tweet_object[:user][:id]

      @image_file_names = tweet_object[:images]
      @video_file_names = tweet_object[:video]
      @video_file_type = tweet_object[:video_file_type]
      @video_preview_image = tweet_object[:video_preview_image]
      # Look up the author given the new id.
      # NOTE: This doesn't *seem* like the right place for this, but I'm not sure where else
      @author = User.new(tweet_object[:user])
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Mosquito
  # A scraped Twitter user profile. Instances are built from the attribute
  # hash produced by UserScraper#parse.
  class User
    # Look up one or more users by handle.
    # NOTE: declared above the `private` marker to make it explicit that this is
    # public — `private` never applied to `def self.` methods in the original
    # either, so behavior is unchanged.
    #
    # @param ids [String, Array<String>] a single handle or an array of handles
    # @return [Array<User>]
    def self.lookup(ids = [])
      # If a single id is passed in we make it the appropriate array
      ids = [ids] unless ids.is_a?(Array)

      user_objects = ids.map { |id| Mosquito::UserScraper.new.parse(id) }

      user_objects.map do |user_object|
        User.new(user_object)
      end
    end

    # Attributes for after the response is parsed from Twitter
    attr_reader :json
    attr_reader :id
    attr_reader :name
    attr_reader :username
    attr_reader :sign_up_date
    attr_reader :location
    attr_reader :profile_image_url
    attr_reader :description
    attr_reader :url
    attr_reader :followers_count
    attr_reader :following_count
    attr_reader :tweet_count
    attr_reader :listed_count
    attr_reader :verified
    attr_reader :created_at
    attr_reader :profile_image_file_name

    private

    # @param user_object [Hash] the attribute hash produced by UserScraper#parse
    def initialize(user_object)
      @id = user_object[:id]
      @name = user_object[:name]
      @username = user_object[:username]
      @created_at = user_object[:sign_up_date]
      # BUGFIX: @sign_up_date was never assigned, so the `sign_up_date` reader
      # always returned nil even though the scraper supplies the value.
      @sign_up_date = user_object[:sign_up_date]
      @location = user_object[:location]

      # Removing the "normal" here gets us the full-sized image, instead of the 150x150 thumbnail
      @profile_image_url = user_object[:profile_image_url]

      @description = user_object[:description]
      @url = user_object[:url]
      @followers_count = user_object[:followers_count]
      @following_count = user_object[:following_count]
      @tweet_count = user_object[:tweet_count]
      @listed_count = user_object[:listed_count]
      @verified = user_object[:verified] # this will always be `false` but we're keeping it here for compatibility
      @profile_image_file_name = user_object[:profile_image_file_name]
    end
  end
end
|
data/lib/mosquito.rb
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
# frozen_string_literal: true

require "json"
require "typhoeus"
require "date"
require "securerandom"
require "helpers/configuration"
require "fileutils"

require_relative "mosquito/version"
require_relative "mosquito/tweet"
require_relative "mosquito/user"
require_relative "mosquito/scrapers/scraper"
require_relative "mosquito/scrapers/tweet_scraper"
require_relative "mosquito/scrapers/user_scraper"

require_relative "mosquito/monkeypatch"

module Mosquito
  extend Configuration

  class Error < StandardError; end
  class AuthorizationError < Error; end
  class InvalidIdError < Error; end
  class InvalidMediaTypeError < Error; end
  class NoTweetFoundError < Error; end

  # Raised when the upstream service throttles us.
  class RateLimitExceeded < Error
    attr_reader :rate_limit
    attr_reader :rate_remaining
    attr_reader :reset_time_left

    # BUGFIX: the constructor previously discarded all three arguments, so the
    # attr_readers above always returned nil.
    def initialize(rate_limit, rate_remaining, reset_time)
      @rate_limit = rate_limit
      @rate_remaining = rate_remaining
      @reset_time_left = reset_time
      super("Rate limit exceeded: #{rate_remaining}/#{rate_limit} remaining, resets in #{reset_time}")
    end
  end

  define_setting :temp_storage_location, "tmp/mosquito"
  define_setting :nitter_url, ENV["NITTER_URL"]
  define_setting :save_media, true

  # The general fields to always return for Users
  def self.user_fields
    "name,created_at,location,profile_image_url,protected,public_metrics,url,username,verified,withheld,description"
  end

  # The general fields to always return for Tweets
  def self.tweet_fields
    "attachments,author_id,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang"
  end

  # Get media from a URL and save to a temp folder set in the configuration under
  # temp_storage_location.
  #
  # @param url [String]
  # @return [String] path of the saved file, or "" when save_media is disabled
  def self.retrieve_media(url)
    return "" if !Mosquito.save_media

    response = Typhoeus.get(url)

    # Get the file extension if it's in the file
    extension = url.split(".").last

    # Do some basic checks so we just empty out if there's something weird in the file extension
    # that could do some harm.
    if extension.length.positive?
      # BUGFIX: only strip a query string when one is present — `extension.index("?")`
      # returns nil for plain URLs, and `extension[0...nil]` raises a TypeError.
      query_start = extension.index("?")
      extension = extension[0...query_start] unless query_start.nil?
      extension = nil unless /^[a-zA-Z0-9]+$/.match?(extension)
      extension = ".#{extension}" unless extension.nil?
    end

    temp_file_name = "#{Mosquito.temp_storage_location}/#{SecureRandom.uuid}#{extension}"

    # We do this in case the folder isn't created yet, since it's a temp folder we'll just do so
    self.create_temp_storage_location
    File.binwrite(temp_file_name, response.body)
    temp_file_name
  end

  private

  # NOTE(review): `private` has no effect on singleton (`def self.`) methods, so
  # this helper is technically public — exactly as in the original.
  def self.create_temp_storage_location
    return if File.exist?(Mosquito.temp_storage_location) && File.directory?(Mosquito.temp_storage_location)
    FileUtils.mkdir_p Mosquito.temp_storage_location
  end
end
|
data/mosquito.gemspec
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
# frozen_string_literal: true

require_relative "lib/mosquito/version"

Gem::Specification.new do |spec|
  spec.name = "mosquito-scrape"
  spec.version = Mosquito::VERSION
  spec.authors = ["Christopher Guess"]
  spec.email = ["cguess@gmail.com"]

  spec.summary = "A gem to scrape a Nitter instance for Twitter data"
  # spec.description = "TODO: Write a longer description or delete this line."
  # spec.homepage = "TODO: Put your gem's website or public repo URL here."
  spec.license = "MIT"
  spec.required_ruby_version = Gem::Requirement.new(">= 2.7.0")

  # spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"

  # spec.metadata["homepage_uri"] = spec.homepage
  # spec.metadata["source_code_uri"] = "TODO: Put your gem's public repo URL here."
  # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Prod dependencies
  spec.add_dependency "typhoeus", "~> 1.4"
  spec.add_dependency "nokogiri", "~> 1.15.5"
  spec.add_dependency "capybara", "~> 3.39"
  spec.add_dependency "dotenv", "~> 2.8"
  spec.add_dependency "oj", "~> 3.16"
  spec.add_dependency "fileutils", "~> 1.7"
  spec.add_dependency "logger", "~> 1.6"
  spec.add_dependency "securerandom", "~> 0.3"
  spec.add_dependency "selenium-webdriver", "~> 4"
  spec.add_dependency "open-uri", "~> 0.4"
  spec.add_dependency "activesupport", "~> 7.0.8"
  spec.add_dependency "rack", "~> 2"

  # Dev dependencies
  spec.add_development_dependency "byebug", "~> 11.0"
  spec.add_development_dependency "rake", "~> 13.0"
  spec.add_development_dependency "minitest", "~> 5"
  spec.add_development_dependency "rubocop", "~> 1.0"
  spec.add_development_dependency "rubocop-rails", "~> 2.0"
  spec.add_development_dependency "rubocop-rails_config", "~> 1.0"
  spec.add_development_dependency "rubocop-performance", "~> 1.0"
  # BUGFIX: dotenv was also declared here as a dev dependency with a "~> 2.0"
  # constraint, conflicting with the "~> 2.8" runtime declaration above; the
  # runtime dependency already covers development use, so the duplicate is removed.

  # For more information and examples about making a new gem, checkout our
  # guide at: https://bundler.io/guides/creating_gem.html
end
|