birdsong 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/birdsong/monkeypatch.rb +52 -0
- data/lib/birdsong/scrapers/scraper.rb +223 -0
- data/lib/birdsong/scrapers/tweet_scraper.rb +112 -0
- data/lib/birdsong/version.rb +1 -1
- data/lib/birdsong.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e225de345219a482d98dc634601169cb2ab42c78cc9105574fb426f34d334980
+  data.tar.gz: 39f26a882f4e5939012fef4f64b30dabe5b28983a8d21fee6723b75a7342f889
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cac7812476ce19901ac91d1701bf8d01772156d1cf1e9e64d4cd17e9a47d1e8cf60e5f8c05b97c387576f0a910a89aeba1aaca9753764107b4cda6f11de97efe
+  data.tar.gz: c50a4320302b0f87ae09be8b445faa52b7935e4177ffa3804de1ade10faad1c4377590750ef3c369d57d977f0e45022db6898dd7a3901426f0cadcfd0007a4e4
data/lib/birdsong/monkeypatch.rb
ADDED
@@ -0,0 +1,52 @@
+require "logger"
+require "selenium-webdriver"
+
+# Design taken from https://blog.appsignal.com/2021/08/24/responsible-monkeypatching-in-ruby.html
+
+module SeleniumMonkeypatch
+  class << self
+    @@logger = Logger.new(STDOUT)
+    @@logger.level = Logger::INFO
+
+    def apply_patch
+      target_class = find_class
+      target_method = find_method(target_class)
+
+      unless target_method
+        raise "Could not find class or method when patching Selenium::WebDriver::DevTools.send_cmd"
+      end
+
+      @@logger.info "#{__FILE__} is monkeypatching Selenium::WebDriver::DevTools.send_cmd"
+      target_class.prepend(InstanceMethods)
+    end
+
+    private
+
+    def find_class
+      Kernel.const_get("Selenium::WebDriver::DevTools")
+    rescue NameError
+    end
+
+    def find_method(class_)
+      return unless class_
+      class_.instance_method(:send_cmd)
+    rescue NameError
+    end
+  end
+
+  module InstanceMethods
+    # We're monkeypatching the following method so that Selenium doesn't raise errors when we fail to call `continue` on requests
+    def send_cmd(method, **params)
+      data = { method: method, params: params.compact }
+      data[:sessionId] = @session_id if @session_id
+      message = @ws.send_cmd(**data)
+      if message.nil? == false && message["error"] && (method != "Fetch.continueRequest")
+        raise Birdsong::Error::WebDriverError, error_message(message["error"])
+      end
+
+      message
+    end
+  end
+end
+
+SeleniumMonkeypatch.apply_patch
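The patch works through Module#prepend: once InstanceMethods is prepended, its send_cmd sits ahead of the original Selenium::WebDriver::DevTools#send_cmd in the ancestor chain, so it is the version that runs, and it declines to raise when a Fetch.continueRequest call comes back with an error. Below is a minimal standalone sketch of the same prepend pattern; the Greeter and LoudGreeting names are invented for illustration and are not part of the gem.

# Illustration only: the prepend-based patching used above, shown outside Selenium.
class Greeter
  def greet(name)
    "Hello, #{name}"
  end
end

module LoudGreeting
  # Because the module is prepended, this method is found before Greeter#greet,
  # just as InstanceMethods#send_cmd is found before DevTools#send_cmd above.
  def greet(name)
    "#{super}!"
  end
end

Greeter.prepend(LoudGreeting)

puts Greeter.new.greet("world")  # => "Hello, world!"
p Greeter.ancestors.first(2)     # => [LoudGreeting, Greeter]

Unlike this sketch, the gem's replacement never calls super; it re-implements send_cmd wholesale so it can inspect the DevTools response before deciding whether to raise.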
data/lib/birdsong/scrapers/scraper.rb
ADDED
@@ -0,0 +1,223 @@
+# frozen_string_literal: true
+
+require "capybara/dsl"
+require "dotenv/load"
+require "oj"
+require "selenium-webdriver"
+require "logger"
+require "securerandom"
+require "selenium/webdriver/remote/http/curb"
+require "debug"
+
+# 2022-06-07 14:15:23 WARN Selenium [DEPRECATION] [:browser_options] :options as a parameter for driver initialization is deprecated. Use :capabilities with an Array of value capabilities/options if necessary instead.
+
+options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
+options.add_argument("--start-maximized")
+options.add_argument("--no-sandbox")
+options.add_argument("--disable-dev-shm-usage")
+options.add_argument("--disable-blink-features=AutomationControlled")
+options.add_argument("--disable-extensions")
+options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
+options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
+options.add_preference "password_manager_enabled", false
+options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
+
+Capybara.register_driver :selenium_birdsong do |app|
+  client = Selenium::WebDriver::Remote::Http::Curb.new
+  # client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
+  Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
+end
+
+Capybara.threadsafe = true
+Capybara.default_max_wait_time = 60
+Capybara.reuse_server = true
+
+module Birdsong
+  class Scraper # rubocop:disable Metrics/ClassLength
+    include Capybara::DSL
+
+    @@logger = Logger.new(STDOUT)
+    @@logger.level = Logger::WARN
+    @@logger.datetime_format = "%Y-%m-%d %H:%M:%S"
+    @@session_id = nil
+
+    def initialize
+      Capybara.default_driver = :selenium_birdsong
+    end
+
+    # Instagram uses GraphQL (like most of Facebook I think), and returns an object that actually
+    # is used to seed the page. We can just parse this for most things.
+    #
+    # additional_search_parameters is a comma-separated list of keys
+    # example: `data,xdt_api__v1__media__shortcode__web_info,items`
+    #
+    # @returns Hash a ruby hash of the JSON data
+    def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
+      # So this is fun:
+      # For pages marked as misinformation we have to use one method (interception of requests) and
+      # for pages that are not, we can just pull the data straight from the page.
+      #
+      # How do we figure out which is which?... for now we'll just run through both and see where we
+      # go with it.
+
+      # Our user data no longer lives in the graphql object passed initially with the page.
+      # Instead it comes in as part of a subsequent call. This intercepts all calls, checks if it's
+      # the one we want, and then moves on.
+      response_body = nil
+
+      page.driver.browser.intercept do |request, &continue|
+        # This passes the request forward unmodified, since we only care about the response
+        # puts "checking request: #{request.url}"
+
+        continue.call(request) && next unless request.url.include?(subpage_search)
+
+
+        continue.call(request) do |response|
+
+          # Check if not a CORS prefetch and finish up if not
+          if !response.body.empty? && response.body
+            check_passed = true
+            unless additional_search_parameters.nil?
+              body_to_check = Oj.load(response.body)
+
+              search_parameters = additional_search_parameters.split(",")
+              search_parameters.each_with_index do |key, index|
+                break if body_to_check.nil?
+
+                check_passed = false unless body_to_check.has_key?(key)
+                body_to_check = body_to_check[key]
+              end
+            end
+
+            response_body = response.body if check_passed == true
+          end
+        end
+      rescue Selenium::WebDriver::Error::WebDriverError
+        # Eat them
+      end
+
+      # Now that the intercept is set up, we visit the page we want
+      page.driver.browser.navigate.to(url)
+      # We wait until the correct intercept is processed or we've waited 60 seconds
+      start_time = Time.now
+      # puts "Waiting.... #{url}"
+
+      sleep(rand(1...10))
+      while response_body.nil? && (Time.now - start_time) < 60
+        sleep(0.1)
+      end
+
+      page.driver.execute_script("window.stop();")
+      raise Birdsong::NoTweetFoundError if response_body.nil?
+      Oj.load(response_body)
+    end
+
+    private
+
+    ##########
+    # Set the session to use a new user folder in the options!
+    # #####################
+    def reset_selenium
+      options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
+      options.add_argument("--start-maximized")
+      options.add_argument("--no-sandbox")
+      options.add_argument("--disable-dev-shm-usage")
+      options.add_argument("--disable-blink-features=AutomationControlled")
+      options.add_argument("--disable-extensions")
+      options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
+
+      options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
+      options.add_preference "password_manager_enabled", false
+      options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
+      # options.add_argument("--user-data-dir=/tmp/tarun")
+
+      Capybara.register_driver :selenium do |app|
+        client = Selenium::WebDriver::Remote::Http::Curb.new
+        # client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
+        Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
+      end
+
+      Capybara.current_driver = :selenium
+    end
+
+    def login
+      # Reset the sessions so that there's nothing lying around
+      page.quit
+
+      # Check if we're on an Instagram page already, if not visit it.
+      unless page.driver.browser.current_url.include? "instagram.com"
+        # There seems to be a bug in the Linux ARM64 version of chromedriver where this will properly
+        # navigate but then timeout, crashing it all up. So instead we check and raise the error when
+        # that then fails again.
+        page.driver.browser.navigate.to("https://instagram.com")
+      end
+
+      # We don't have to log in if we already are
+      begin
+        return if find_field("Search", wait: 10).present?
+      rescue Capybara::ElementNotFound; end
+
+      # Check if we're redirected to a login page, if we aren't we're already logged in
+      return unless page.has_xpath?('//*[@id="loginForm"]/div/div[3]/button')
+
+      # Try to log in
+      loop_count = 0
+      while loop_count < 5 do
+        fill_in("username", with: ENV["INSTAGRAM_USER_NAME"])
+        fill_in("password", with: ENV["INSTAGRAM_PASSWORD"])
+
+        begin
+          click_button("Log in", exact_text: true) # Note: "Log in" (lowercase `in`) instead redirects to Facebook's login page
+        rescue Capybara::ElementNotFound; end # If we can't find it don't break horribly, just keep waiting
+
+        break unless has_css?('p[data-testid="login-error-message"]', wait: 10)
+        loop_count += 1
+        sleep(rand * 10.3)
+      end
+
+      # Sometimes Instagram just... doesn't let you log in
+      raise "Instagram not accessible" if loop_count == 5
+
+      # No, we don't want to save our login credentials
+      begin
+        click_on("Save Info")
+      rescue Capybara::ElementNotFound; end
+    end
+
+    def fetch_image(url)
+      request = Typhoeus::Request.new(url, followlocation: true)
+      request.on_complete do |response|
+        if request.success?
+          return request.body
+        elsif request.timed_out?
+          raise Zorki::Error("Fetching image at #{url} timed out")
+        else
+          raise Zorki::Error("Fetching image at #{url} returned non-successful HTTP server response #{request.code}")
+        end
+      end
+    end
+
+    # Convert a string to an integer
+    def number_string_to_integer(number_string)
+      # First we have to remove any commas in the number or else it all breaks
+      number_string = number_string.delete(",")
+      # Is the last digit not a number? If so, we're going to have to multiply it by some multiplier
+      should_expand = /[0-9]/.match(number_string[-1, 1]).nil?
+
+      # Get the last index and remove the letter at the end if we should expand
+      last_index = should_expand ? number_string.length - 1 : number_string.length
+      number = number_string[0, last_index].to_f
+      multiplier = 1
+      # Determine the multiplier depending on the letter indicated
+      case number_string[-1, 1]
+      when "m"
+        multiplier = 1_000_000
+      end
+
+      # Multiply everything and ensure we get an integer back
+      (number * multiplier).to_i
+    end
+  end
+end
+
+# require_relative "tweet_scraper"
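get_content_of_subpage_from_url does the heavy lifting for every scraper: it registers a Chrome DevTools intercept, navigates to the page, and only accepts a response whose URL contains subpage_search and whose JSON body contains each comma-separated key from additional_search_parameters, walked in order. A hypothetical call is sketched below, mirroring the arguments the TweetScraper in the next file passes; the tweet id is illustrative and the snippet does not ship with the gem.

# Hypothetical usage sketch of Birdsong::Scraper#get_content_of_subpage_from_url.
require "birdsong"

scraper = Birdsong::TweetScraper.new   # any Scraper subclass; TweetScraper is defined in the next file
graphql = scraper.get_content_of_subpage_from_url(
  "https://twitter.com/jack/status/20",  # page to visit (the id here is illustrative)
  "/graphql",                            # substring the intercepted request URL must contain
  "data,tweetResult,result"              # nested keys the JSON body must contain, in order
)
# graphql is the parsed Hash of the first intercepted /graphql response for which
# graphql["data"]["tweetResult"]["result"] is present; Birdsong::NoTweetFoundError
# is raised if no matching response arrives within roughly 60 seconds.

For reference, the private number_string_to_integer helper at the bottom of the class strips commas and applies an "m" multiplier, so "1,234" becomes 1234 and "1.2m" becomes 1_200_000.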
data/lib/birdsong/scrapers/tweet_scraper.rb
ADDED
@@ -0,0 +1,112 @@
+# frozen_string_literal: true
+
+require "typhoeus"
+require_relative "scraper"
+
+module Birdsong
+  class TweetScraper < Scraper
+    def parse(id)
+      # Stuff we need to get from the DOM (implemented is starred):
+      # - User *
+      # - Text *
+      # - Image * / Images * / Video *
+      # - Date *
+      # - Number of likes *
+      # - Hashtags
+
+      Capybara.app_host = "https://twitter.com"
+
+      # video slideshows https://www.instagram.com/p/CY7KxwYOFBS/?utm_source=ig_embed&utm_campaign=loading
+      # login
+      graphql_object = get_content_of_subpage_from_url(
+        "https://twitter.com/jack/status/#{id}",
+        "/graphql",
+        "data,tweetResult,result"
+      )
+
+      graphql_object = graphql_object.first if graphql_object.kind_of?(Array)
+      graphql_object = graphql_object["data"]["tweetResult"]["result"]
+
+      if graphql_object.key?("__typename") && graphql_object["__typename"] == "TweetUnavailable"
+        raise Birdsong::NoTweetFoundError
+      end
+
+      text = graphql_object["legacy"]["full_text"]
+      date = graphql_object["legacy"]["created_at"]
+      id = graphql_object["legacy"]["id_str"]
+      number_of_likes = graphql_object["legacy"]["favorite_count"]
+      language = graphql_object["legacy"]["lang"]
+
+      images = []
+      videos = []
+      video_preview_image = nil
+      video_file_type = nil
+
+      if graphql_object["legacy"]["entities"].key?("media")
+        graphql_object["legacy"]["entities"]["media"].each do |media|
+          case media["type"]
+          when "photo"
+            images << Birdsong.retrieve_media(media["media_url_https"])
+          when "video"
+            video_preview_image = Birdsong.retrieve_media(media["media_url_https"])
+            video_variants = media["video_info"]["variants"]
+            largest_bitrate_variant = video_variants.sort_by do |variant|
+              variant["bitrate"].nil? ? 0 : variant["bitrate"]
+            end.last
+
+            videos << Birdsong.retrieve_media(largest_bitrate_variant["url"])
+            video_file_type = "video"
+          when "animated_gif"
+            video_preview_image = Birdsong.retrieve_media(media["media_url_https"])
+            videos << media["video_info"]["variants"].first["url"]
+            video_file_type = "animated_gif"
+          end
+        end
+      end
+
+      screenshot_file = take_screenshot()
+
+      # This has to run last since it switches pages
+      user_object = graphql_object["core"]["user_results"]["result"]
+      user = {
+        id: user_object["id"],
+        name: user_object["legacy"]["name"],
+        username: user_object["legacy"]["screen_name"],
+        sign_up_date: user_object["legacy"]["created_at"],
+        location: user_object["legacy"]["location"],
+        profile_image_url: user_object["legacy"]["profile_image_url_https"],
+        description: user_object["legacy"]["description"],
+        followers_count: user_object["legacy"]["followers_count"],
+        following_count: user_object["legacy"]["friends_count"],
+        tweet_count: user_object["legacy"]["statuses_count"],
+        listed_count: user_object["legacy"]["listed_count"],
+        verified: user_object["legacy"]["verified"],
+        url: user_object["legacy"]["url"],
+        profile_image_file_name: Birdsong.retrieve_media(user_object["legacy"]["profile_image_url_https"])
+      }
+
+      page.quit
+
+      {
+        images: images,
+        video: videos,
+        video_preview_image: video_preview_image,
+        screenshot_file: screenshot_file,
+        text: text,
+        date: date,
+        number_of_likes: number_of_likes,
+        user: user,
+        id: id,
+        language: language,
+        video_file_type: video_file_type
+      }
+    end
+
+    def take_screenshot
+      # First check if a post has a fact check overlay, if so, clear it.
+      # The only issue is that this can take *awhile* to search. Not sure what to do about that
+      # since it's Instagram's fault for having such a fucked up obfuscated hierarchy # Take the screenshot and return it
+      save_screenshot("#{Birdsong.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
+    end
+  end
+end
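The hash that parse returns is the scraper's whole public payload. A hypothetical invocation is sketched below; the id is illustrative and the snippet is not part of the gem (Birdsong.retrieve_media and Birdsong.temp_storage_location, used above, are defined elsewhere in the gem).

# Hypothetical invocation sketch for Birdsong::TweetScraper#parse.
tweet = Birdsong::TweetScraper.new.parse("20")  # id is interpolated into twitter.com/jack/status/<id>

tweet[:text]             # full_text from the tweet's "legacy" payload
tweet[:date]             # created_at string from the same payload
tweet[:number_of_likes]  # favorite_count
tweet[:images]           # files saved via Birdsong.retrieve_media for photo entities
tweet[:video]            # highest-bitrate variant for each video entity
tweet[:video_file_type]  # "video", "animated_gif", or nil
tweet[:screenshot_file]  # PNG written by take_screenshot
tweet[:user][:username]  # screen_name from core.user_results.result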
data/lib/birdsong/version.rb
CHANGED
data/lib/birdsong.rb
CHANGED
@@ -10,7 +10,7 @@ require "fileutils"
 require_relative "birdsong/version"
 require_relative "birdsong/tweet"
 require_relative "birdsong/user"
-
+require_relative "birdsong/scrapers/scraper"
 require_relative "birdsong/scrapers/tweet_scraper"
 
 require_relative "birdsong/monkeypatch"
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: birdsong
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.2.3
 platform: ruby
 authors:
 - Christopher Guess
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-10-
+date: 2023-10-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: typhoeus
@@ -172,6 +172,9 @@ files:
 - bin/setup
 - birdsong.gemspec
 - lib/birdsong.rb
+- lib/birdsong/monkeypatch.rb
+- lib/birdsong/scrapers/scraper.rb
+- lib/birdsong/scrapers/tweet_scraper.rb
 - lib/birdsong/tweet.rb
 - lib/birdsong/user.rb
 - lib/birdsong/version.rb