birdsong 0.2.1 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/birdsong/monkeypatch.rb +52 -0
- data/lib/birdsong/scrapers/scraper.rb +223 -0
- data/lib/birdsong/scrapers/tweet_scraper.rb +112 -0
- data/lib/birdsong/version.rb +1 -1
- data/lib/birdsong.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e225de345219a482d98dc634601169cb2ab42c78cc9105574fb426f34d334980
|
4
|
+
data.tar.gz: 39f26a882f4e5939012fef4f64b30dabe5b28983a8d21fee6723b75a7342f889
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cac7812476ce19901ac91d1701bf8d01772156d1cf1e9e64d4cd17e9a47d1e8cf60e5f8c05b97c387576f0a910a89aeba1aaca9753764107b4cda6f11de97efe
|
7
|
+
data.tar.gz: c50a4320302b0f87ae09be8b445faa52b7935e4177ffa3804de1ade10faad1c4377590750ef3c369d57d977f0e45022db6898dd7a3901426f0cadcfd0007a4e4
|
require "logger"
require "selenium-webdriver"

# Monkeypatches Selenium::WebDriver::DevTools#send_cmd so that an error response
# to "Fetch.continueRequest" is ignored instead of raised -- the scrapers
# intercept requests and sometimes fail to `continue` them after the page moves on.
# Design taken from https://blog.appsignal.com/2021/08/24/responsible-monkeypatching-in-ruby.html
module SeleniumMonkeypatch
  # Class-instance variable instead of a @@class variable: it is not shared with
  # any subclass tree and is only read inside this module's singleton methods.
  @logger = Logger.new(STDOUT)
  @logger.level = Logger::INFO

  class << self
    # Locate Selenium's DevTools class and prepend our override.
    # @raise [RuntimeError] if the class or method cannot be found (e.g. after a
    #   selenium-webdriver upgrade renames or removes #send_cmd)
    def apply_patch
      target_class = find_class
      target_method = find_method(target_class)

      raise "Could not find class or method when patching Selenium::WebDriver::DevTools.send_cmd" unless target_method

      @logger.info "#{__FILE__} is monkeypatching Selenium::WebDriver::DevTools.send_cmd"
      target_class.prepend(InstanceMethods)
    end

    private

    # @return [Class, nil] the DevTools class, or nil when selenium-webdriver
    #   does not define it
    def find_class
      Kernel.const_get("Selenium::WebDriver::DevTools")
    rescue NameError
    end

    # @return [UnboundMethod, nil] #send_cmd on the given class, or nil when the
    #   class is missing or does not define the method
    def find_method(class_)
      return unless class_
      class_.instance_method(:send_cmd)
    rescue NameError
    end
  end

  module InstanceMethods
    # We're monkeypatching the following method so that Selenium doesn't raise
    # errors when we fail to call `continue` on intercepted requests.
    def send_cmd(method, **params)
      data = { method: method, params: params.compact }
      data[:sessionId] = @session_id if @session_id
      message = @ws.send_cmd(**data)
      # `message && ...` replaces the non-idiomatic `message.nil? == false && ...`
      if message && message["error"] && (method != "Fetch.continueRequest")
        raise Birdsong::Error::WebDriverError, error_message(message["error"])
      end

      message
    end
  end
end

SeleniumMonkeypatch.apply_patch
|
# frozen_string_literal: true

require "capybara/dsl"
require "dotenv/load"
require "oj"
require "selenium-webdriver"
require "logger"
require "securerandom"
require "selenium/webdriver/remote/http/curb"
require "debug"

# 2022-06-07 14:15:23 WARN Selenium [DEPRECATION] [:browser_options] :options as a parameter for driver initialization is deprecated. Use :capabilities with an Array of value capabilities/options if necessary instead.

options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
options.add_argument("--start-maximized")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
# Fixed: this flag previously began with an en-dash ("–-"), so Chrome silently ignored it
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--disable-extensions")
options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
options.add_preference "password_manager_enabled", false
# Fresh profile directory per process so parallel runs don't share state
options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")

Capybara.register_driver :selenium_birdsong do |app|
  client = Selenium::WebDriver::Remote::Http::Curb.new
  # client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
  Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
end

Capybara.threadsafe = true
Capybara.default_max_wait_time = 60
Capybara.reuse_server = true
35
|
+
module Birdsong
  # Base class for Selenium/Capybara-driven scrapers. Handles request
  # interception, (Instagram-style) login, media fetching, and count parsing.
  class Scraper # rubocop:disable Metrics/ClassLength
    include Capybara::DSL

    # NOTE(review): @@class variables are shared across the inheritance tree
    # (TweetScraper inherits them); kept as-is so subclasses keep seeing the
    # same logger/session.
    @@logger = Logger.new(STDOUT)
    @@logger.level = Logger::WARN
    @@logger.datetime_format = "%Y-%m-%d %H:%M:%S"
    @@session_id = nil

    def initialize
      Capybara.default_driver = :selenium_birdsong
    end

    # Visits `url` while intercepting network traffic and returns the parsed
    # JSON body of the first response whose request URL contains
    # `subpage_search`. The data we want no longer lives in the GraphQL object
    # seeded into the page; it arrives in a subsequent call, so we intercept all
    # calls, check each one, and move on.
    #
    # additional_search_parameters is a comma-separated key path the JSON body
    # must contain, e.g. `data,xdt_api__v1__media__shortcode__web_info,items`
    #
    # @return [Hash] a Ruby hash of the JSON data
    # @raise [Birdsong::NoTweetFoundError] if no matching response arrives within ~60s
    def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
      response_body = nil

      page.driver.browser.intercept do |request, &continue|
        # Pass requests we don't care about forward unmodified
        continue.call(request) && next unless request.url.include?(subpage_search)

        continue.call(request) do |response|
          # Fixed: nil-check the body BEFORE calling #empty? (was reversed,
          # risking NoMethodError on a nil body). Empty bodies are e.g. CORS
          # preflight responses.
          next if response.body.nil? || response.body.empty?

          check_passed = true
          unless additional_search_parameters.nil?
            body_to_check = Oj.load(response.body)

            additional_search_parameters.split(",").each do |key|
              break if body_to_check.nil?

              check_passed = false unless body_to_check.has_key?(key)
              body_to_check = body_to_check[key]
            end
          end

          response_body = response.body if check_passed
        end
      rescue Selenium::WebDriver::Error::WebDriverError
        # Eat them -- failing to continue a request is not fatal (see monkeypatch.rb)
      end

      # Now that the intercept is set up, we visit the page we want
      page.driver.browser.navigate.to(url)

      # We wait until the correct intercept is processed or we've waited 60 seconds
      start_time = Time.now
      sleep(rand(1...10))
      sleep(0.1) while response_body.nil? && (Time.now - start_time) < 60

      page.driver.execute_script("window.stop();")
      raise Birdsong::NoTweetFoundError if response_body.nil?
      Oj.load(response_body)
    end

    private

    ##########
    # Set the session to use a new user folder in the options!
    # #####################
    def reset_selenium
      options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
      options.add_argument("--start-maximized")
      options.add_argument("--no-sandbox")
      options.add_argument("--disable-dev-shm-usage")
      # Fixed: this flag previously began with an en-dash ("–-"), so Chrome ignored it
      options.add_argument("--disable-blink-features=AutomationControlled")
      options.add_argument("--disable-extensions")
      options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")

      options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
      options.add_preference "password_manager_enabled", false
      options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")

      Capybara.register_driver :selenium do |app|
        client = Selenium::WebDriver::Remote::Http::Curb.new
        # client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
        Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
      end

      Capybara.current_driver = :selenium
    end

    # Logs in to Instagram with INSTAGRAM_USER_NAME / INSTAGRAM_PASSWORD env
    # vars, retrying up to five times. No-op when a session is already active.
    def login
      # Reset the sessions so that there's nothing laying around
      page.quit

      # Check if we're on an Instagram page already, if not visit it.
      unless page.driver.browser.current_url.include? "instagram.com"
        # There seems to be a bug in the Linux ARM64 version of chromedriver where this will properly
        # navigate but then timeout, crashing it all up. So instead we check and raise the error when
        # that then fails again.
        page.driver.browser.navigate.to("https://instagram.com")
      end

      # We don't have to login if we already are
      begin
        return if find_field("Search", wait: 10).present?
      rescue Capybara::ElementNotFound
        # Not logged in yet -- fall through to the login flow
      end

      # Check if we're redirected to a login page, if we aren't we're already logged in
      return unless page.has_xpath?('//*[@id="loginForm"]/div/div[3]/button')

      # Try to log in
      loop_count = 0
      while loop_count < 5
        fill_in("username", with: ENV["INSTAGRAM_USER_NAME"])
        fill_in("password", with: ENV["INSTAGRAM_PASSWORD"])

        begin
          click_button("Log in", exact_text: true) # Note: "Log in" (lowercase `in`) instead redirects to Facebook's login page
        rescue Capybara::ElementNotFound
          # If we can't find it don't break horribly, just keep waiting
        end

        # Fixed: selector was missing its closing `]`, so login errors were never detected
        break unless has_css?('p[data-testid="login-error-message"]', wait: 10)
        loop_count += 1
        sleep(rand * 10.3)
      end

      # Sometimes Instagram just... doesn't let you log in
      raise "Instagram not accessible" if loop_count == 5

      # No we don't want to save our login credentials
      begin
        click_on("Save Info")
      rescue Capybara::ElementNotFound
        # The "save your info" dialog didn't appear -- nothing to dismiss
      end
    end

    # Fetch the raw bytes of the image at `url`, following redirects.
    # Fixed: the completion callback previously queried `request` (which has no
    # #success?/#body) instead of the yielded `response`, raised the wrong
    # namespace (`Zorki::Error(...)` -- a method call, not a raise of a class),
    # and the request was never run.
    # @raise [Birdsong::Error] on timeout or a non-successful HTTP response
    #   (assumes Birdsong::Error is a StandardError subclass -- confirm in lib/birdsong.rb)
    def fetch_image(url)
      request = Typhoeus::Request.new(url, followlocation: true)
      request.on_complete do |response|
        if response.success?
          return response.body
        elsif response.timed_out?
          raise Birdsong::Error, "Fetching image at #{url} timed out"
        else
          raise Birdsong::Error, "Fetching image at #{url} returned non-successful HTTP server response #{response.code}"
        end
      end
      request.run
    end

    # Convert a count string such as "1,234", "1.5k" or "2m" to an Integer.
    def number_string_to_integer(number_string)
      # First we have to remove any commas in the number or else it all breaks
      number_string = number_string.delete(",")
      # Is the last digit not a number? If so, we're going to have to multiply it by some multiplier
      suffix = number_string[-1, 1]
      should_expand = /[0-9]/.match(suffix).nil?

      # Get the last index and remove the letter at the end if we should expand
      last_index = should_expand ? number_string.length - 1 : number_string.length
      number = number_string[0, last_index].to_f
      # Determine the multiplier depending on the letter indicated
      # (generalized: "k" and "b" previously fell through to 1, truncating counts)
      multiplier =
        case suffix
        when "k" then 1_000
        when "m" then 1_000_000
        when "b" then 1_000_000_000
        else 1
        end

      # Multiply everything and insure we get an integer back
      (number * multiplier).to_i
    end
  end
end

# require_relative "tweet_scraper"
|
# frozen_string_literal: true

require "typhoeus"
require_relative "scraper"

module Birdsong
  # Scrapes a single tweet page and extracts its text, media, metrics, author
  # details and a screenshot.
  class TweetScraper < Scraper
    # Fetch and parse the tweet with the given id.
    #
    # Extracted fields: user, text, images/videos, date, number of likes,
    # language, screenshot.
    #
    # @return [Hash] the tweet's attributes
    # @raise [Birdsong::NoTweetFoundError] when the tweet is unavailable
    def parse(id)
      Capybara.app_host = "https://twitter.com"

      # The tweet data arrives via a GraphQL XHR rather than the initial page
      # load, so we intercept that call. The username in the path is irrelevant;
      # Twitter resolves the status by id alone.
      graphql_object = get_content_of_subpage_from_url(
        "https://twitter.com/jack/status/#{id}",
        "/graphql",
        "data,tweetResult,result"
      )

      graphql_object = graphql_object.first if graphql_object.kind_of?(Array)
      graphql_object = graphql_object["data"]["tweetResult"]["result"]

      if graphql_object.key?("__typename") && graphql_object["__typename"] == "TweetUnavailable"
        raise Birdsong::NoTweetFoundError
      end

      legacy = graphql_object["legacy"]

      images = []
      videos = []
      video_preview_image = nil
      video_file_type = nil

      media_entries = legacy["entities"].key?("media") ? legacy["entities"]["media"] : []
      media_entries.each do |media|
        kind = media["type"]
        if kind == "photo"
          images << Birdsong.retrieve_media(media["media_url_https"])
        elsif kind == "video"
          video_preview_image = Birdsong.retrieve_media(media["media_url_https"])
          # Pick the highest-bitrate variant (variants without a bitrate rank lowest)
          best_variant = media["video_info"]["variants"].sort_by do |variant|
            variant["bitrate"].nil? ? 0 : variant["bitrate"]
          end.last
          videos << Birdsong.retrieve_media(best_variant["url"])
          video_file_type = "video"
        elsif kind == "animated_gif"
          video_preview_image = Birdsong.retrieve_media(media["media_url_https"])
          videos << media["video_info"]["variants"].first["url"]
          video_file_type = "animated_gif"
        end
      end

      screenshot_file = take_screenshot

      # This has to run last since it switches pages
      user_object = graphql_object["core"]["user_results"]["result"]
      user_legacy = user_object["legacy"]
      user = {
        id: user_object["id"],
        name: user_legacy["name"],
        username: user_legacy["screen_name"],
        sign_up_date: user_legacy["created_at"],
        location: user_legacy["location"],
        profile_image_url: user_legacy["profile_image_url_https"],
        description: user_legacy["description"],
        followers_count: user_legacy["followers_count"],
        following_count: user_legacy["friends_count"],
        tweet_count: user_legacy["statuses_count"],
        listed_count: user_legacy["listed_count"],
        verified: user_legacy["verified"],
        url: user_legacy["url"],
        profile_image_file_name: Birdsong.retrieve_media(user_legacy["profile_image_url_https"])
      }

      page.quit

      {
        images: images,
        video: videos,
        video_preview_image: video_preview_image,
        screenshot_file: screenshot_file,
        text: legacy["full_text"],
        date: legacy["created_at"],
        number_of_likes: legacy["favorite_count"],
        user: user,
        id: legacy["id_str"],
        language: legacy["lang"],
        video_file_type: video_file_type
      }
    end

    # Take a screenshot of the current page and return its path.
    # NOTE(review): the filename still says "instagram" -- presumably a leftover
    # from the zorki gem this was adapted from; renaming it would change output
    # paths, so it is preserved here.
    def take_screenshot
      save_screenshot("#{Birdsong.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
    end
  end
end
data/lib/birdsong/version.rb
CHANGED
data/lib/birdsong.rb
CHANGED
@@ -10,7 +10,7 @@ require "fileutils"
|
|
10
10
|
require_relative "birdsong/version"
|
11
11
|
require_relative "birdsong/tweet"
|
12
12
|
require_relative "birdsong/user"
|
13
|
-
|
13
|
+
require_relative "birdsong/scrapers/scraper"
|
14
14
|
require_relative "birdsong/scrapers/tweet_scraper"
|
15
15
|
|
16
16
|
require_relative "birdsong/monkeypatch"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: birdsong
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christopher Guess
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-10-
|
11
|
+
date: 2023-10-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: typhoeus
|
@@ -172,6 +172,9 @@ files:
|
|
172
172
|
- bin/setup
|
173
173
|
- birdsong.gemspec
|
174
174
|
- lib/birdsong.rb
|
175
|
+
- lib/birdsong/monkeypatch.rb
|
176
|
+
- lib/birdsong/scrapers/scraper.rb
|
177
|
+
- lib/birdsong/scrapers/tweet_scraper.rb
|
175
178
|
- lib/birdsong/tweet.rb
|
176
179
|
- lib/birdsong/user.rb
|
177
180
|
- lib/birdsong/version.rb
|