mosquito-scrape 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,227 @@
1
# frozen_string_literal: true

require "capybara/dsl"
require "dotenv/load"
require "oj"
require "selenium-webdriver"
require "logger"
require "securerandom"
require "selenium/webdriver/remote/http/curb"
require "debug"
require "typhoeus"

# Build the Chrome options shared by the :selenium_mosquito Capybara driver.
# (Selenium 4 deprecated passing :options directly at driver init — see the
# 2022-06-07 warning this setup was written against.)
options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
options.add_argument("--start-maximized")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
# BUG FIX: this flag previously began with an en-dash ("–-"), so Chrome
# silently ignored it; it must be two ASCII hyphens.
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--disable-extensions")
options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
options.add_preference "password_manager_enabled", false
# A unique user-data dir per process keeps parallel scrapers from sharing state.
options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")

Capybara.register_driver :selenium_mosquito do |app|
  client = Selenium::WebDriver::Remote::Http::Curb.new
  # client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
  Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
end

Capybara.threadsafe = true
Capybara.default_max_wait_time = 60
Capybara.reuse_server = true
36
module Mosquito
  class Scraper # rubocop:disable Metrics/ClassLength
    include Capybara::DSL

    # Class variables are shared with subclasses on purpose: every scraper
    # logs through the same logger and reuses the same session id.
    @@logger = Logger.new(STDOUT)
    @@logger.level = Logger::WARN
    @@logger.datetime_format = "%Y-%m-%d %H:%M:%S"
    @@session_id = nil

    def initialize
      Capybara.default_driver = :selenium_mosquito
    end

    # Fetch a URL directly over HTTP (no browser), following redirects.
    # @return [Typhoeus::Response]
    def get_content_of_page_from_url_curl(url)
      Typhoeus.get(url, followlocation: true)
    end

    # Instagram uses GraphQL (like most of Facebook I think), and returns an object that actually
    # is used to seed the page. We can just parse this for most things.
    #
    # additional_search_parameters is a comma separated list of keys
    # example: `data,xdt_api__v1__media__shortcode__web_info,items`
    #
    # @return [Hash] a ruby hash of the JSON data
    # @raise [Mosquito::NoTweetFoundError] if no matching response arrives within 60s
    def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
      # So this is fun:
      # For pages marked as misinformation we have to use one method (interception of request) and
      # for pages that are not, we can just pull the data straight from the page.
      #
      # How do we figure out which is which?... for now we'll just run through both and see where we
      # go with it.

      # Our user data no longer lives in the graphql object passed initially with the page.
      # Instead it comes in as part of a subsequent call. This intercepts all calls, checks if it's
      # the one we want, and then moves on.
      response_body = nil

      page.driver.browser.intercept do |request, &continue|
        # This passes the request forward unmodified, since we only care about the response
        continue.call(request) && next unless request.url.include?(subpage_search)

        continue.call(request) do |response|
          # Check if not a CORS prefetch and finish up if not.
          # BUG FIX: nil-check the body *before* calling #empty? — the original
          # order raised NoMethodError on responses with no body.
          if response.body && !response.body.empty?
            check_passed = true
            unless additional_search_parameters.nil?
              body_to_check = Oj.load(response.body)

              # Walk the JSON down the requested key path; stop early if a key
              # is missing along the way.
              search_parameters = additional_search_parameters.split(",")
              search_parameters.each do |key|
                break if body_to_check.nil?

                check_passed = false unless body_to_check.key?(key)
                body_to_check = body_to_check[key]
              end
            end

            response_body = response.body if check_passed == true
          end
        end
      rescue Selenium::WebDriver::Error::WebDriverError
        # Eat them
      end

      # Now that the intercept is set up, we visit the page we want
      page.driver.browser.navigate.to(url)
      # We wait until the correct intercept is processed or we've waited 60 seconds
      start_time = Time.now

      sleep(rand(1...10))
      while response_body.nil? && (Time.now - start_time) < 60
        sleep(0.1)
      end

      page.driver.execute_script("window.stop();")
      raise Mosquito::NoTweetFoundError if response_body.nil?

      Oj.load(response_body)
    end

    private

    ##########
    # Set the session to use a new user folder in the options!
    # #####################
    def reset_selenium
      options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
      options.add_argument("--start-maximized")
      options.add_argument("--no-sandbox")
      options.add_argument("--disable-dev-shm-usage")
      # BUG FIX: this flag previously began with an en-dash ("–-") and was
      # silently ignored by Chrome.
      options.add_argument("--disable-blink-features=AutomationControlled")
      options.add_argument("--disable-extensions")
      options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")

      options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
      options.add_preference "password_manager_enabled", false
      options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
      # options.add_argument("--user-data-dir=/tmp/tarun")

      Capybara.register_driver :selenium do |app|
        client = Selenium::WebDriver::Remote::Http::Curb.new
        # client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
        Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
      end

      Capybara.current_driver = :selenium
    end

    # Log into Instagram using INSTAGRAM_USER_NAME / INSTAGRAM_PASSWORD from
    # the environment, retrying up to five times.
    # @raise [RuntimeError] if Instagram refuses the login five times
    def login
      # Reset the sessions so that there's nothing laying around
      page.quit

      # Check if we're on a Instagram page already, if not visit it.
      unless page.driver.browser.current_url.include? "instagram.com"
        # There seems to be a bug in the Linux ARM64 version of chromedriver where this will properly
        # navigate but then timeout, crashing it all up. So instead we check and raise the error when
        # that then fails again.
        page.driver.browser.navigate.to("https://instagram.com")
      end

      # We don't have to login if we already are
      begin
        return if find_field("Search", wait: 10).present?
      rescue Capybara::ElementNotFound; end

      # Check if we're redirected to a login page, if we aren't we're already logged in
      return unless page.has_xpath?('//*[@id="loginForm"]/div/div[3]/button')

      # Try to log in
      loop_count = 0
      while loop_count < 5 do
        fill_in("username", with: ENV["INSTAGRAM_USER_NAME"])
        fill_in("password", with: ENV["INSTAGRAM_PASSWORD"])

        begin
          click_button("Log in", exact_text: true) # Note: "Log in" (lowercase `in`) instead redirects to Facebook's login page
        rescue Capybara::ElementNotFound; end # If we can't find it don't break horribly, just keep waiting

        # BUG FIX: the attribute selector was missing its closing "]", making
        # it an invalid CSS selector.
        break unless has_css?('p[data-testid="login-error-message"]', wait: 10)

        loop_count += 1
        sleep(rand * 10.3)
      end

      # Sometimes Instagram just... doesn't let you log in
      raise "Instagram not accessible" if loop_count == 5

      # No we don't want to save our login credentials
      begin
        click_on("Save Info")
      rescue Capybara::ElementNotFound; end
    end

    # Download the image at +url+ and return its raw bytes.
    # @raise [Mosquito::Error] on timeout or a non-successful HTTP response
    def fetch_image(url)
      request = Typhoeus::Request.new(url, followlocation: true)
      request.on_complete do |response|
        if request.success?
          return request.body
        elsif request.timed_out?
          # BUG FIX: the original raised Zorki::Error(...) — a leftover from the
          # zorki gem that is undefined here (it would raise NoMethodError).
          raise Mosquito::Error, "Fetching image at #{url} timed out"
        else
          raise Mosquito::Error, "Fetching image at #{url} returned non-successful HTTP server response #{request.code}"
        end
      end
      # BUG FIX: the request was never executed, so the on_complete callback
      # (and the `return` inside it) never fired.
      request.run
    end

    # Convert a count string such as "1,024" or "2.5m" to an Integer.
    # GENERALIZED: the original only understood the "m" suffix; "k" and "b"
    # are handled too now (plain numeric strings behave exactly as before).
    def number_string_to_integer(number_string)
      # First we have to remove any commas in the number or else it all breaks
      number_string = number_string.delete(",")
      # Is the last digit not a number? If so, we're going to have to multiply it by some multiplier
      should_expand = /[0-9]/.match(number_string[-1, 1]).nil?

      # Get the last index and remove the letter at the end if we should expand
      last_index = should_expand ? number_string.length - 1 : number_string.length
      number = number_string[0, last_index].to_f

      # Determine the multiplier depending on the letter indicated
      multiplier =
        case number_string[-1, 1]
        when "k" then 1_000
        when "m" then 1_000_000
        when "b" then 1_000_000_000
        else 1
        end

      # Multiply everything and ensure we get an integer back
      (number * multiplier).to_i
    end
  end
end
226
+
227
+ # require_relative "tweet_scraper"
@@ -0,0 +1,95 @@
1
# frozen_string_literal: true

require "typhoeus"
require_relative "scraper"
require "nokogiri"
require "open-uri"

module Mosquito
  class TweetScraper < Scraper
    # Scrape a single tweet by id from the configured Nitter instance.
    # @param id [String] the numeric tweet id
    # @return [Hash] tweet attributes, including the scraped :user hash
    # @raise [Mosquito::NoTweetFoundError] when Nitter 404s or shows an error panel
    def parse(id)
      # Stuff we need to get from the DOM (implemented is starred):
      # - User
      # - Text *
      # - Image / Images / Video *
      # - Date *
      # - Number of likes *
      # - Hashtags

      Capybara.app_host = ENV["NITTER_URL"]

      # Nitter routes any /<user>/status/<id> to the tweet regardless of user,
      # so "jack" works as a placeholder handle.
      begin
        doc = Nokogiri::HTML(URI.open("#{ENV["NITTER_URL"]}/jack/status/#{id}"))
      rescue OpenURI::HTTPError
        raise Mosquito::NoTweetFoundError
      end

      # Nitter renders an error panel instead of a 404 for some missing tweets
      unless doc.xpath("//div[contains(@class, 'error-panel')]").empty?
        raise Mosquito::NoTweetFoundError
      end

      text = doc.xpath("//div[contains(@class, 'tweet-content media-body')]").first.content
      date = DateTime.parse(doc.xpath("//span[contains(@class, 'tweet-date')]").first.child["title"])
      id = URI.parse(doc.xpath("//link[contains(@rel, 'canonical')]").first["href"]).path.split("/").last
      number_of_likes = doc.xpath("//span[contains(@class, 'tweet-stat')][last()]/div").first.content.delete(",").to_i
      language = "en" # We can't determine this anymore with Nitter, but english will be fine, we don't actually use this anywhere... i think

      images = []
      videos = []
      video_preview_image = nil
      video_file_type = nil

      # Single image
      # BUG FIX: the original selected a nonexistent <href> child element
      # ("…/href") so single images were never collected; we want the @href
      # attribute, host-prefixed like the slideshow branch below (the hrefs
      # are presumably relative, same as the gallery rows — verify on a live
      # Nitter instance).
      nodes = doc.xpath("//a[contains(@class, 'still-image')][1]/@href")
      images.concat(nodes.map { |node| "#{Capybara.app_host}#{node.value}" })

      # Slideshow
      nodes = doc.xpath("//div[contains(@class, 'main-tweet')]/div/div/div[contains(@class, 'attachments')]/div[contains(@class, 'gallery-row')]/div/a/@href")
      images.concat(nodes.map { |node| "#{Capybara.app_host}#{node.value}" })

      # Video
      nodes = doc.xpath("//div[contains(@class, 'main-tweet')]/div/div/div[contains(@class, 'attachments')]/div[contains(@class, 'gallery-video')]/div/video")
      unless nodes.empty?
        video_preview_image = nodes.first["poster"]
        videos.concat(nodes.map { |node| "#{Capybara.app_host}#{node.xpath("//source").first["src"]}" })
        video_file_type = "video" # This is always video now, since a gif isn't displayed differently
      end

      # GIF
      nodes = doc.xpath("//div[contains(@class, 'main-tweet')]/div/div/div[contains(@class, 'attachments')]/div[contains(@class, 'gallery-gif')]/div/video")
      unless nodes.empty?
        video_preview_image = nodes.first["poster"]
        videos.concat(nodes.map { |node| "#{Capybara.app_host}#{node.xpath("//source[1]/source/@src").first&.content}" })
        video_file_type = "gif"
      end

      username = doc.xpath("//a[contains(@class, 'username')][1]/@href").first.value
      user = UserScraper.new.parse(username)

      screenshot_file = take_screenshot()

      {
        images: images,
        video: videos,
        video_preview_image: video_preview_image,
        screenshot_file: screenshot_file,
        text: text,
        date: date,
        number_of_likes: number_of_likes,
        user: user,
        id: id,
        language: language,
        video_file_type: video_file_type
      }
    end

    # Take a screenshot of the current page and return the saved file's path.
    # NOTE(review): the comment this replaced talked about Instagram fact-check
    # overlays — presumably copied from the zorki gem; no overlay handling is
    # implemented here (the "instagram_" filename prefix is the same leftover).
    def take_screenshot
      save_screenshot("#{Mosquito.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
    end
  end
end
@@ -0,0 +1,79 @@
1
# frozen_string_literal: true

require "typhoeus"
require_relative "scraper"
require "nokogiri"
require "open-uri"

module Mosquito
  class UserScraper < Scraper
    # Scrape a user's profile page from the configured Nitter instance.
    # @param username [String] the handle, with or without a leading "/"
    # @return [Hash] the user's attributes
    # @raise [Mosquito::NoTweetFoundError] when Nitter shows an error panel
    def parse(username)
      # Stuff we need to get from the DOM (implemented is starred):
      # id, name, username, sign_up_date, location, profile_image_url,
      # description, followers_count, following_count, tweet_count,
      # listed_count, verified, url, profile_image_file_name

      Capybara.app_host = ENV["NITTER_URL"]

      # Usernames scraped from tweet links arrive as "/handle"; normalize them
      username = username.delete("/")

      doc = Nokogiri::HTML(URI.open("#{ENV["NITTER_URL"]}/#{username}"))

      # Nitter renders an error panel instead of a 404 for missing users
      unless doc.xpath("//div[contains(@class, 'error-panel')]").empty?
        raise Mosquito::NoTweetFoundError
      end

      id = username
      full_name = doc.xpath("//a[contains(@class, 'profile-card-fullname')]/@title").first&.value

      # ROBUSTNESS FIX: the original fed a possibly-nil value straight into
      # DateTime.parse, which raises TypeError when the join date is missing.
      joined = doc.xpath("//div[contains(@class, 'profile-joindate')]/span/@title").first&.value
      sign_up_date = joined && DateTime.parse(joined)

      location = doc.xpath("//div[contains(@class, 'profile-location')]/span[last()]").first&.content
      profile_image_url = "#{Capybara.app_host}#{doc.xpath("//a[contains(@class, 'profile-card-avatar')]/@href").first&.value}"
      description = doc.xpath("//div[contains(@class, 'profile-bio')]/p").first&.content
      followers_count = doc.xpath("//li[contains(@class, 'followers')]/span[contains(@class, 'profile-stat-num')]").first&.content&.delete(",").to_i
      following_count = doc.xpath("//li[contains(@class, 'following')]/span[contains(@class, 'profile-stat-num')]").first&.content&.delete(",").to_i
      tweet_count = doc.xpath("//li[contains(@class, 'posts')]/span[contains(@class, 'profile-stat-num')]").first&.content&.delete(",").to_i
      listed_count = 0 # We can't get this from nitter, and it's not a big deal
      verified = !doc.xpath("//a[contains(@class, 'profile-card-fullname')]/div/span[contains(@title, 'Verified account')]").empty?
      url = doc.xpath("//div[contains(@class, 'profile-website')]/span[last()]/a/@href").first&.content
      profile_image_file_name = Mosquito.retrieve_media(profile_image_url)

      {
        id: id,
        name: full_name,
        username: username,
        sign_up_date: sign_up_date,
        location: location,
        profile_image_url: profile_image_url,
        description: description,
        followers_count: followers_count,
        following_count: following_count,
        tweet_count: tweet_count,
        listed_count: listed_count,
        verified: verified,
        url: url,
        profile_image_file_name: profile_image_file_name
      }
    end

    # Take a screenshot of the current page and return the saved file's path.
    # NOTE(review): the comment this replaced talked about Instagram fact-check
    # overlays — presumably copied from the zorki gem; no overlay handling is
    # implemented here (the "instagram_" filename prefix is the same leftover).
    def take_screenshot
      save_screenshot("#{Mosquito.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
    end
  end
end
@@ -0,0 +1,52 @@
1
# frozen_string_literal: true

module Mosquito
  # A single scraped tweet. Instances are built via `Tweet.lookup`, never
  # constructed directly (the constructor is private).
  class Tweet
    # Scrape one or more tweets by id and wrap each result in a Tweet.
    # @param ids [String, Array<String>] one id or a list of numeric-string ids
    # @raise [Mosquito::InvalidIdError] for any id that is not all digits
    # @return [Array<Tweet>]
    def self.lookup(ids = [])
      # Accept a bare id as well as an array of ids
      ids = [ids] unless ids.is_a?(Array)

      # Validate every id up front, before any scraping work happens
      ids.each do |id|
        raise Mosquito::InvalidIdError unless /\A\d+\z/.match?(id)
      end

      ids.map { |id| Tweet.new(Mosquito::TweetScraper.new.parse(id)) }
    end

    # Attributes populated from the scraped tweet hash
    attr_reader :json
    attr_reader :id
    attr_reader :created_at
    attr_reader :text
    attr_reader :language
    attr_reader :author_id
    attr_reader :author
    attr_reader :image_file_names
    attr_reader :video_file_names
    attr_reader :video_file_type
    attr_reader :video_preview_image

    alias_method :user, :author # Every other gem uses `user` so we can just alias it

    private

    # @param tweet_object [Hash] the hash produced by TweetScraper#parse
    def initialize(tweet_object)
      @id = tweet_object[:id]
      @created_at = tweet_object[:date]
      @text = tweet_object[:text]
      @language = tweet_object[:language]
      @author_id = tweet_object[:user][:id]

      @image_file_names = tweet_object[:images]
      @video_file_names = tweet_object[:video]
      @video_file_type = tweet_object[:video_file_type]
      @video_preview_image = tweet_object[:video_preview_image]

      # Resolve the author eagerly so callers never need a second lookup.
      # NOTE: This doesn't *seem* like the right place for this, but I'm not sure where else
      @author = User.new(tweet_object[:user])
    end
  end
end
@@ -0,0 +1,57 @@
1
# frozen_string_literal: true

module Mosquito
  # A scraped Twitter/Nitter user profile. Built via `User.lookup` (or by
  # Tweet, which embeds the author); the constructor is private.
  class User
    # Attributes populated from the scraped user hash
    attr_reader :json
    attr_reader :id
    attr_reader :name
    attr_reader :username
    attr_reader :sign_up_date
    attr_reader :location
    attr_reader :profile_image_url
    attr_reader :description
    attr_reader :url
    attr_reader :followers_count
    attr_reader :following_count
    attr_reader :tweet_count
    attr_reader :listed_count
    attr_reader :verified
    attr_reader :created_at
    attr_reader :profile_image_file_name

    # Scrape one or more users by handle and wrap each result in a User.
    # @param ids [String, Array<String>] one handle or a list of handles
    # @return [Array<User>]
    def self.lookup(ids = [])
      # Accept a bare handle as well as an array of handles
      ids = [ids] unless ids.is_a?(Array)

      ids.map { |id| User.new(Mosquito::UserScraper.new.parse(id)) }
    end

    private

    # @param user_object [Hash] the hash produced by UserScraper#parse
    def initialize(user_object)
      @id, @name, @username = user_object.values_at(:id, :name, :username)
      @created_at = user_object[:sign_up_date]
      @location = user_object[:location]

      # Removing the "normal" here gets us the full-sized image, instead of the 150x150 thumbnail
      @profile_image_url = user_object[:profile_image_url]

      @description, @url = user_object.values_at(:description, :url)
      @followers_count = user_object[:followers_count]
      @following_count = user_object[:following_count]
      @tweet_count = user_object[:tweet_count]
      @listed_count = user_object[:listed_count]
      @verified = user_object[:verified] # this will always be `false` but we're keeping it here for compatibility
      @profile_image_file_name = user_object[:profile_image_file_name]
    end
  end
end
@@ -0,0 +1,5 @@
1
# frozen_string_literal: true

module Mosquito
  # Current release of the mosquito-scrape gem.
  VERSION = "0.1.0"
end
data/lib/mosquito.rb ADDED
@@ -0,0 +1,82 @@
1
# frozen_string_literal: true

require "json"
require "typhoeus"
require "date"
require "securerandom"
require "helpers/configuration"
require "fileutils"

require_relative "mosquito/version"
require_relative "mosquito/tweet"
require_relative "mosquito/user"
require_relative "mosquito/scrapers/scraper"
require_relative "mosquito/scrapers/tweet_scraper"
require_relative "mosquito/scrapers/user_scraper"

require_relative "mosquito/monkeypatch"

module Mosquito
  extend Configuration

  class Error < StandardError; end
  class AuthorizationError < Error; end
  class InvalidIdError < Error; end
  class InvalidMediaTypeError < Error; end
  class NoTweetFoundError < Error; end

  # Raised when the upstream service rate limits us.
  class RateLimitExceeded < Error
    attr_reader :rate_limit
    attr_reader :rate_remaining
    attr_reader :reset_time_left

    # BUG FIX: the original constructor silently discarded its arguments, so
    # the attr_readers above always returned nil and the error had no message.
    def initialize(rate_limit, rate_remaining, reset_time)
      @rate_limit = rate_limit
      @rate_remaining = rate_remaining
      @reset_time_left = reset_time
      super("Rate limit exceeded: #{rate_remaining}/#{rate_limit} remaining, resets in #{reset_time}")
    end
  end

  define_setting :temp_storage_location, "tmp/mosquito"
  define_setting :nitter_url, ENV["NITTER_URL"]
  define_setting :save_media, true

  # The general fields to always return for Users
  def self.user_fields
    "name,created_at,location,profile_image_url,protected,public_metrics,url,username,verified,withheld,description"
  end

  # The general fields to always return for Tweets
  def self.tweet_fields
    "attachments,author_id,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang"
  end

  # Get media from a URL and save to a temp folder set in the configuration under
  # temp_storage_location.
  # @return [String] the path of the written file ("" when save_media is off)
  def self.retrieve_media(url)
    return "" if !Mosquito.save_media

    response = Typhoeus.get(url)

    # Get the file extension if it's in the file
    extension = url.split(".").last

    # Do some basic checks so we just empty out if there's something weird in the file extension
    # that could do some harm.
    if extension.length.positive?
      extension = extension[0...extension.index("?")]
      # SECURITY FIX: anchored with \A/\z instead of ^/$ — the per-line anchors
      # would accept an extension containing an embedded newline.
      extension = nil unless /\A[a-zA-Z0-9]+\z/.match?(extension)
      extension = ".#{extension}" unless extension.nil?
    end

    temp_file_name = "#{Mosquito.temp_storage_location}/#{SecureRandom.uuid}#{extension}"

    # We do this in case the folder isn't created yet, since it's a temp folder we'll just do so
    self.create_temp_storage_location
    File.binwrite(temp_file_name, response.body)
    temp_file_name
  end

  # Ensure the temp storage folder exists (idempotent).
  def self.create_temp_storage_location
    return if File.exist?(Mosquito.temp_storage_location) && File.directory?(Mosquito.temp_storage_location)

    FileUtils.mkdir_p Mosquito.temp_storage_location
  end

  # BUG FIX: a bare `private` has no effect on `def self.` methods; this is
  # what the original's `private` marker intended.
  private_class_method :create_temp_storage_location
end
data/mosquito.gemspec ADDED
@@ -0,0 +1,58 @@
1
# frozen_string_literal: true

require_relative "lib/mosquito/version"

Gem::Specification.new do |spec|
  spec.name = "mosquito-scrape"
  spec.version = Mosquito::VERSION
  spec.authors = ["Christopher Guess"]
  spec.email = ["cguess@gmail.com"]

  spec.summary = "A gem to scrape a Nitter instance for Twitter data"
  # spec.description = "TODO: Write a longer description or delete this line."
  # spec.homepage = "TODO: Put your gem's website or public repo URL here."
  spec.license = "MIT"
  spec.required_ruby_version = ">= 2.7.0"

  # spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"

  # spec.metadata["homepage_uri"] = spec.homepage
  # spec.metadata["source_code_uri"] = "TODO: Put your gem's public repo URL here."
  # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."

  # Ship every git-tracked file except the test/spec/feature trees.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Runtime dependencies
  spec.add_dependency "activesupport", "~> 7.0.8"
  spec.add_dependency "capybara", "~> 3.39"
  spec.add_dependency "dotenv", "~> 2.8"
  spec.add_dependency "fileutils", "~> 1.7"
  spec.add_dependency "logger", "~> 1.6"
  spec.add_dependency "nokogiri", "~> 1.15.5"
  spec.add_dependency "oj", "~> 3.16"
  spec.add_dependency "open-uri", "~> 0.4"
  spec.add_dependency "rack", "~> 2"
  spec.add_dependency "securerandom", "~> 0.3"
  spec.add_dependency "selenium-webdriver", "~> 4"
  spec.add_dependency "typhoeus", "~> 1.4"

  # Development-only dependencies
  spec.add_development_dependency "byebug", "~> 11.0"
  spec.add_development_dependency "dotenv", "~> 2.0"
  spec.add_development_dependency "minitest", "~> 5"
  spec.add_development_dependency "rake", "~> 13.0"
  spec.add_development_dependency "rubocop", "~> 1.0"
  spec.add_development_dependency "rubocop-performance", "~> 1.0"
  spec.add_development_dependency "rubocop-rails", "~> 2.0"
  spec.add_development_dependency "rubocop-rails_config", "~> 1.0"

  # For more information and examples about making a new gem, checkout our
  # guide at: https://bundler.io/guides/creating_gem.html
end