birdsong 0.2.1 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 22fda7c7cab000c5a34df61c63e8ee422037ee23a7e7a238a9dff4065bfb6527
4
- data.tar.gz: 789744baff3ff5803d99f9c6de95773093bb7199e5c20fd9424faefc87d3bbb9
3
+ metadata.gz: 91cec2f940312e308a14a0f84049908d0a38cc9ef53548c0ef9907c5c46bffeb
4
+ data.tar.gz: a7871d2629fa2eec4bb9ee73b635bf974f1ec5f7c0606070e446b6937c60fe9d
5
5
  SHA512:
6
- metadata.gz: f754cc423c11cf4829f07887567499b0dc537e9755e66351f9106306f8ed9593c43ff351aae297ae0c12c157ac73331157ac2f216b3386c35df8d612d66643c5
7
- data.tar.gz: e1d7725cc7c1c9625d2d9b996e8e24a861c83502c6615d72cd686979dbfda969940dd957900eff0d21d6045d3210a9fe2a6354a03d787452a4d0fadcc3250a07
6
+ metadata.gz: 72b073c5546b6a4b22184809d4b1e8381bdd18b07f577f49d6c096ecded7e8b08ff546730a913cb8f41945036f9372bfb503b655eda24bdcff9ba924f9e7b81a
7
+ data.tar.gz: fa6f487b45dc2b284773d4dbccefefeacb74200b2b0f14341a187d899bc82e2048a65e50524e9577e05e0cd9f1a4f0d7f0b8bbdc37d3075a643e143aa7dc9064
@@ -0,0 +1,52 @@
1
+ require "logger"
2
+ require "selenium-webdriver"
3
+
4
+ # Design taken from https://blog.appsignal.com/2021/08/24/responsible-monkeypatching-in-ruby.html
5
+
6
# Patches Selenium::WebDriver::DevTools#send_cmd so that an error reply to
# `Fetch.continueRequest` no longer raises, while error replies to every other
# DevTools command still do.
module SeleniumMonkeypatch
  @logger = Logger.new(STDOUT)
  @logger.level = Logger::INFO

  class << self
    # Prepends InstanceMethods onto Selenium::WebDriver::DevTools.
    #
    # @raise [RuntimeError] if the class, or its `send_cmd` instance method, cannot be found.
    #   Failing loudly here means a Selenium upgrade that renames either one is noticed at load time.
    def apply_patch
      target_class = find_class

      unless find_method(target_class)
        raise "Could not find class or method when patching Selenium::WebDriver::DevTools.send_cmd"
      end

      @logger.info "#{__FILE__} is monkeypatching Selenium::WebDriver::DevTools.send_cmd"
      target_class.prepend(InstanceMethods)
    end

    private

    # @return [Class, nil] the DevTools class, or nil when it is not loaded
    def find_class
      Kernel.const_get("Selenium::WebDriver::DevTools")
    rescue NameError
      nil
    end

    # @param class_ [Class, nil] the class to inspect
    # @return [UnboundMethod, nil] its `send_cmd` instance method, or nil when absent
    def find_method(class_)
      class_&.instance_method(:send_cmd)
    rescue NameError
      nil
    end
  end

  module InstanceMethods
    # Sends a DevTools command over the websocket and returns the reply.
    #
    # We're monkeypatching this method so that Selenium doesn't raise errors when we fail to
    # call `continue` on requests.
    #
    # @param method [String] the DevTools command name, e.g. "Fetch.continueRequest"
    # @param params [Hash] the command parameters; nil values are dropped before sending
    # @return [Hash, nil] whatever the websocket's `send_cmd` returned
    # @raise [Birdsong::WebDriverError] if the reply carries an "error" entry and the command
    #   is anything other than "Fetch.continueRequest"
    def send_cmd(method, **params)
      data = { method: method, params: params.compact }
      data[:sessionId] = @session_id if @session_id
      message = @ws.send_cmd(**data)

      if message && message["error"] && method != "Fetch.continueRequest"
        # The error class is declared directly under `module Birdsong`, not under Birdsong::Error
        raise Birdsong::WebDriverError, error_message(message["error"])
      end

      message
    end
  end
end
51
+
52
+ SeleniumMonkeypatch.apply_patch
@@ -0,0 +1,224 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "capybara/dsl"
4
+ require "dotenv/load"
5
+ require "oj"
6
+ require "selenium-webdriver"
7
+ require "logger"
8
+ require "securerandom"
9
+ require "selenium/webdriver/remote/http/curb"
10
+ require "debug"
11
+
12
+ # 2022-06-07 14:15:23 WARN Selenium [DEPRECATION] [:browser_options] :options as a parameter for driver initialization is deprecated. Use :capabilities with an Array of value capabilities/options if necessary instead.
13
+
14
# Chrome options shared by the :selenium_birdsong driver registered below.
options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])

[
  "--start-maximized",
  "--no-sandbox",
  "--disable-dev-shm-usage",
  # This switch previously began with an en dash ("–-"), so Chrome never recognised it
  "--disable-blink-features=AutomationControlled",
  "--disable-extensions",
  "--enable-features=NetworkService,NetworkServiceInProcess",
  "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
  # A unique profile directory per process so that runs do not share browser state
  "--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}"
].each { |argument| options.add_argument(argument) }

options.add_preference "password_manager_enabled", false

Capybara.register_driver :selenium_birdsong do |app|
  client = Selenium::WebDriver::Remote::Http::Curb.new
  # client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
  Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
end

Capybara.threadsafe = true
Capybara.default_max_wait_time = 60
Capybara.reuse_server = true
34
+
35
+ module Birdsong
36
+ class Scraper # rubocop:disable Metrics/ClassLength
37
+ include Capybara::DSL
38
+
39
+ @@logger = Logger.new(STDOUT)
40
+ @@logger.level = Logger::WARN
41
+ @@logger.datetime_format = "%Y-%m-%d %H:%M:%S"
42
+ @@session_id = nil
43
+
44
# Makes :selenium_birdsong (the driver registered at the top of this file) Capybara's
# default driver for every scraper instance.
def initialize
  Capybara.default_driver = :selenium_birdsong
end
47
+
48
# Loads `url` in the browser and captures the body of a background request the page makes.
#
# The data we want is not part of the initial page; it arrives in a subsequent call. A DevTools
# intercept is therefore installed before navigating: requests whose URL does not contain
# `subpage_search` are passed through untouched, while the responses of matching requests are
# inspected and, if they qualify, kept as the result.
#
# @param url [String] the page to navigate to
# @param subpage_search [String] substring a request URL must contain to be inspected
# @param additional_search_parameters [String, nil] optional comma-separated list of keys that
#   must be nested, in that order, inside the parsed response body
#   (example: `data,xdt_api__v1__media__shortcode__web_info,items`)
# @return [Hash, Array] the parsed JSON of the captured response body
# @raise [Birdsong::NoTweetFoundError] if no qualifying response was captured within the wait
def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
  response_body = nil

  page.driver.browser.intercept do |request, &continue|
    # Pass unrelated requests forward unmodified. The skip must not depend on the return value
    # of `continue.call`, otherwise a falsy return would continue the same request twice.
    unless request.url.include?(subpage_search)
      continue.call(request)
      next
    end

    continue.call(request) do |response|
      body = response.body
      # Responses without a body (e.g. a CORS preflight) carry nothing to parse
      next if body.nil? || body.empty?

      response_body = body if response_matches_search?(body, additional_search_parameters)
    end
  rescue Selenium::WebDriver::Error::WebDriverError, Birdsong::WebDriverError
    # Deliberately swallowed: a failure while handling one intercepted request must not abort
    # the page load. Birdsong::WebDriverError is what the patched DevTools#send_cmd raises.
  end

  # Now that the intercept is set up, we visit the page we want
  page.driver.browser.navigate.to(url)

  # Wait until a qualifying response has been captured or 60 seconds have passed
  start_time = Time.now
  sleep(rand(1...10))
  sleep(0.1) while response_body.nil? && (Time.now - start_time) < 60

  page.driver.execute_script("window.stop();")

  raise Birdsong::NoTweetFoundError if response_body.nil?

  Oj.load(response_body)
end

# Decides whether a response body contains the nested key path being searched for.
#
# @param body [String] raw JSON response body
# @param additional_search_parameters [String, nil] comma-separated key path, or nil to accept any body
# @return [Boolean] true when every key is present, each nested inside the previous one
private def response_matches_search?(body, additional_search_parameters)
  return true if additional_search_parameters.nil?

  node = Oj.load(body)
  additional_search_parameters.split(",").all? do |key|
    # Anything that is not a Hash (nil, Array, scalar) cannot contain the next key
    next false unless node.is_a?(Hash) && node.key?(key)

    node = node[key]
    true
  end
end
115
+
116
+ private
117
+
118
# Registers a fresh Chrome-backed Capybara driver under the name :selenium, using a new
# user-data directory, and makes it the current driver.
def reset_selenium
  options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])

  [
    "--start-maximized",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    # This switch previously began with an en dash ("–-"), so Chrome never recognised it
    "--disable-blink-features=AutomationControlled",
    "--disable-extensions",
    "--enable-features=NetworkService,NetworkServiceInProcess",
    "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
    # A new profile directory on every reset so nothing carries over from the previous session
    "--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}"
  ].each { |argument| options.add_argument(argument) }

  options.add_preference "password_manager_enabled", false

  Capybara.register_driver :selenium do |app|
    client = Selenium::WebDriver::Remote::Http::Curb.new
    # client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
    Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
  end

  Capybara.current_driver = :selenium
end
143
+
144
+ # def login
145
+ # # Reset the sessions so that there's nothing laying around
146
+ # page.quit
147
+
148
+ # # Check if we're on a Instagram page already, if not visit it.
149
+ # unless page.driver.browser.current_url.include? "twitter.com"
150
+ # # There seems to be a bug in the Linux ARM64 version of chromedriver where this will properly
151
+ # # navigate but then timeout, crashing it all up. So instead we check and raise the error when
152
+ # # that then fails again.
153
+ # page.driver.browser.navigate.to("https://twitter.com")
154
+ # end
155
+
156
+ # # We don't have to login if we already are
157
+ # begin
158
+ # return if find_field("Search", wait: 10).present?
159
+ # rescue Capybara::ElementNotFound; end
160
+
161
+ # # Check if we're redirected to a login page, if we aren't we're already logged in
162
+ # return unless page.has_xpath?('//*[@id="loginForm"]/div/div[3]/button')
163
+
164
+ # # Try to log in
165
+ # loop_count = 0
166
+ # while loop_count < 5 do
167
+ # fill_in("username", with: ENV["TWITTER_USER_NAME"])
168
+ # fill_in("password", with: ENV["TWITTER_PASSWORD"])
169
+
170
+ # begin
171
+ # click_button("Log in", exact_text: true) # Note: "Log in" (lowercase `in`) instead redirects to Facebook's login page
172
+ # rescue Capybara::ElementNotFound; end # If we can't find it don't break horribly, just keep waiting
173
+
174
+ # break unless has_css?('p[data-testid="login-error-message"', wait: 10)
175
+ # loop_count += 1
176
+ # sleep(rand * 10.3)
177
+ # end
178
+
179
+ # # Sometimes Instagram just... doesn't let you log in
180
+ # raise "Instagram not accessible" if loop_count == 5
181
+
182
+ # # No we don't want to save our login credentials
183
+ # begin
184
+ # click_on("Save Info")
185
+ # rescue Capybara::ElementNotFound; end
186
+ # end
187
+
188
# Downloads the resource at `url`, following redirects.
#
# @param url [String] address of the image to fetch
# @return [String] the response body
# @raise [Birdsong::Error] if the request timed out or the server answered unsuccessfully
def fetch_image(url)
  # The request has to be run explicitly; building it alone performs no HTTP call
  response = Typhoeus::Request.new(url, followlocation: true).run
  return response.body if response.success?

  raise Birdsong::Error, "Fetching image at #{url} timed out" if response.timed_out?

  raise Birdsong::Error, "Fetching image at #{url} returned non-successful HTTP server response #{response.code}"
end
200
+
201
# Converts a human-readable count such as "1,234", "12.3K" or "1.5m" to an integer.
#
# Thousands separators are removed and a trailing K / M / B suffix (either case) is expanded.
# Any other trailing character is ignored, and a string with no leading number yields 0.
#
# @param number_string [String] the count as displayed on the page
# @return [Integer] the expanded count, truncated toward zero
def number_string_to_integer(number_string)
  # First we have to remove any commas in the number or else it all breaks
  cleaned = number_string.delete(",").strip

  # Determine the multiplier depending on the letter indicated
  multiplier = case cleaned[-1]&.downcase
               when "k" then 1_000
               when "m" then 1_000_000
               when "b" then 1_000_000_000
               else 1
               end

  # String#to_r reads the leading number and ignores the suffix. Exact rational arithmetic
  # keeps e.g. "1.005k" from truncating to 1004, which float multiplication would do.
  (cleaned.to_r * multiplier).to_i
end
221
+ end
222
+ end
223
+
224
+ # require_relative "tweet_scraper"
@@ -0,0 +1,112 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "typhoeus"
4
+ require_relative "scraper"
5
+
6
+ module Birdsong
7
+ class TweetScraper < Scraper
8
# Scrapes a single tweet by loading its page and reading the intercepted GraphQL response.
#
# Media files, the author's profile image and a screenshot are stored through
# `Birdsong.retrieve_media` / `take_screenshot`; the returned hash refers to those results.
#
# @param id [String, Integer] the id of the tweet to scrape
# @return [Hash] with the keys :images, :video, :video_preview_image, :screenshot_file, :text,
#   :date, :number_of_likes, :user, :id, :language and :video_file_type
# @raise [Birdsong::NoTweetFoundError] if no tweet data was captured or the tweet is unavailable
def parse(id)
  Capybara.app_host = "https://twitter.com"

  # NOTE(review): "jack" looks like a placeholder screen name, presumably because the status id
  # alone identifies the tweet — confirm.
  graphql_object = get_content_of_subpage_from_url(
    "https://twitter.com/jack/status/#{id}",
    "/graphql",
    "data,tweetResult,result"
  )

  graphql_object = graphql_object.first if graphql_object.kind_of?(Array)
  graphql_object = graphql_object["data"]["tweetResult"]["result"]

  raise Birdsong::NoTweetFoundError if graphql_object["__typename"] == "TweetUnavailable"

  legacy = graphql_object["legacy"]
  text = legacy["full_text"]
  date = legacy["created_at"]
  id = legacy["id_str"]
  number_of_likes = legacy["favorite_count"]
  language = legacy["lang"]

  images = []
  videos = []
  video_preview_image = nil
  video_file_type = nil

  legacy["entities"].fetch("media", []).each do |media|
    case media["type"]
    when "photo"
      images << Birdsong.retrieve_media(media["media_url_https"])
    when "video"
      video_preview_image = Birdsong.retrieve_media(media["media_url_https"])
      # Variants without a bitrate sort lowest, so the highest-bitrate one ends up last
      largest_bitrate_variant = media["video_info"]["variants"].sort_by do |variant|
        variant["bitrate"].nil? ? 0 : variant["bitrate"]
      end.last

      videos << Birdsong.retrieve_media(largest_bitrate_variant["url"])
      video_file_type = "video"
    when "animated_gif"
      video_preview_image = Birdsong.retrieve_media(media["media_url_https"])
      # NOTE(review): unlike the "video" branch this stores the remote URL rather than the
      # result of Birdsong.retrieve_media — confirm that is intended.
      videos << media["video_info"]["variants"].first["url"]
      video_file_type = "animated_gif"
    end
  end

  screenshot_file = take_screenshot

  user_object = graphql_object["core"]["user_results"]["result"]
  user_legacy = user_object["legacy"]
  user = {
    id: user_object["id"],
    name: user_legacy["name"],
    username: user_legacy["screen_name"],
    sign_up_date: user_legacy["created_at"],
    location: user_legacy["location"],
    profile_image_url: user_legacy["profile_image_url_https"],
    description: user_legacy["description"],
    followers_count: user_legacy["followers_count"],
    following_count: user_legacy["friends_count"],
    tweet_count: user_legacy["statuses_count"],
    listed_count: user_legacy["listed_count"],
    verified: user_legacy["verified"],
    url: user_legacy["url"],
    profile_image_file_name: Birdsong.retrieve_media(user_legacy["profile_image_url_https"])
  }

  {
    images: images,
    video: videos,
    video_preview_image: video_preview_image,
    screenshot_file: screenshot_file,
    text: text,
    date: date,
    number_of_likes: number_of_likes,
    user: user,
    id: id,
    language: language,
    video_file_type: video_file_type
  }
ensure
  # Quit the browser on every exit path; previously an error anywhere above skipped this call
  # and left the session running.
  page.quit
end
104
+
105
# Saves a screenshot of the current page as a uniquely named PNG inside
# `Birdsong.temp_storage_location`.
#
# @return the value returned by `save_screenshot` for that path
def take_screenshot
  file_name = "instagram_screenshot_#{SecureRandom.uuid}.png"
  save_screenshot("#{Birdsong.temp_storage_location}/#{file_name}")
end
111
+ end
112
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Birdsong
4
- VERSION = "0.2.1"
4
+ VERSION = "0.2.4"
5
5
  end
data/lib/birdsong.rb CHANGED
@@ -10,7 +10,7 @@ require "fileutils"
10
10
  require_relative "birdsong/version"
11
11
  require_relative "birdsong/tweet"
12
12
  require_relative "birdsong/user"
13
- # require_relative "birdsong/scrapers/scraper"
13
+ require_relative "birdsong/scrapers/scraper"
14
14
  require_relative "birdsong/scrapers/tweet_scraper"
15
15
 
16
16
  require_relative "birdsong/monkeypatch"
@@ -23,6 +23,7 @@ module Birdsong
23
23
  class InvalidIdError < Error; end
24
24
  class InvalidMediaTypeError < Error; end
25
25
  class NoTweetFoundError < Error; end
26
+ class WebDriverError < Error; end
26
27
  class RateLimitExceeded < Error
27
28
  attr_reader :rate_limit
28
29
  attr_reader :rate_remaining
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: birdsong
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Christopher Guess
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-10-04 00:00:00.000000000 Z
11
+ date: 2024-04-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: typhoeus
@@ -172,6 +172,9 @@ files:
172
172
  - bin/setup
173
173
  - birdsong.gemspec
174
174
  - lib/birdsong.rb
175
+ - lib/birdsong/monkeypatch.rb
176
+ - lib/birdsong/scrapers/scraper.rb
177
+ - lib/birdsong/scrapers/tweet_scraper.rb
175
178
  - lib/birdsong/tweet.rb
176
179
  - lib/birdsong/user.rb
177
180
  - lib/birdsong/version.rb
@@ -197,7 +200,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
197
200
  - !ruby/object:Gem::Version
198
201
  version: '0'
199
202
  requirements: []
200
- rubygems_version: 3.4.19
203
+ rubygems_version: 3.4.20
201
204
  signing_key:
202
205
  specification_version: 4
203
206
  summary: A gem to interface with Twitter's API V2