birdsong 0.2.1 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 22fda7c7cab000c5a34df61c63e8ee422037ee23a7e7a238a9dff4065bfb6527
4
- data.tar.gz: 789744baff3ff5803d99f9c6de95773093bb7199e5c20fd9424faefc87d3bbb9
3
+ metadata.gz: 91cec2f940312e308a14a0f84049908d0a38cc9ef53548c0ef9907c5c46bffeb
4
+ data.tar.gz: a7871d2629fa2eec4bb9ee73b635bf974f1ec5f7c0606070e446b6937c60fe9d
5
5
  SHA512:
6
- metadata.gz: f754cc423c11cf4829f07887567499b0dc537e9755e66351f9106306f8ed9593c43ff351aae297ae0c12c157ac73331157ac2f216b3386c35df8d612d66643c5
7
- data.tar.gz: e1d7725cc7c1c9625d2d9b996e8e24a861c83502c6615d72cd686979dbfda969940dd957900eff0d21d6045d3210a9fe2a6354a03d787452a4d0fadcc3250a07
6
+ metadata.gz: 72b073c5546b6a4b22184809d4b1e8381bdd18b07f577f49d6c096ecded7e8b08ff546730a913cb8f41945036f9372bfb503b655eda24bdcff9ba924f9e7b81a
7
+ data.tar.gz: fa6f487b45dc2b284773d4dbccefefeacb74200b2b0f14341a187d899bc82e2048a65e50524e9577e05e0cd9f1a4f0d7f0b8bbdc37d3075a643e143aa7dc9064
@@ -0,0 +1,52 @@
1
+ require "logger"
2
+ require "selenium-webdriver"
3
+
4
+ # Design taken from https://blog.appsignal.com/2021/08/24/responsible-monkeypatching-in-ruby.html
5
+
6
+ module SeleniumMonkeypatch
7
+ class << self
8
+ @@logger = Logger.new(STDOUT)
9
+ @@logger.level = Logger::INFO
10
+
11
+ def apply_patch
12
+ target_class = find_class
13
+ target_method = find_method(target_class)
14
+
15
+ unless target_method
16
+ raise "Could not find class or method when patching Selenium::WebDriver::DevTools.send_cmd"
17
+ end
18
+
19
+ @@logger.info "#{__FILE__} is monkeypatching Selenium::WebDriver::DevTools.send_cmd"
20
+ target_class.prepend(InstanceMethods)
21
+ end
22
+
23
+ private
24
+
25
+ def find_class
26
+ Kernel.const_get("Selenium::WebDriver::DevTools")
27
+ rescue NameError
28
+ end
29
+
30
+ def find_method(class_)
31
+ return unless class_
32
+ class_.instance_method(:send_cmd)
33
+ rescue NameError
34
+ end
35
+ end
36
+
37
+ module InstanceMethods
38
+ # We're monkeypatching the following method so that Selenium doesn't raise errors when we fail to call `continue` on requests
39
+ def send_cmd(method, **params)
40
+ data = { method: method, params: params.compact }
41
+ data[:sessionId] = @session_id if @session_id
42
+ message = @ws.send_cmd(**data)
43
+ if message.nil? == false && message["error"] && (method != "Fetch.continueRequest")
44
+ raise Birdsong::Error::WebDriverError, error_message(message["error"])
45
+ end
46
+
47
+ message
48
+ end
49
+ end
50
+ end
51
+
52
+ SeleniumMonkeypatch.apply_patch
@@ -0,0 +1,224 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "capybara/dsl"
4
+ require "dotenv/load"
5
+ require "oj"
6
+ require "selenium-webdriver"
7
+ require "logger"
8
+ require "securerandom"
9
+ require "selenium/webdriver/remote/http/curb"
10
+ require "debug"
11
+
12
+ # 2022-06-07 14:15:23 WARN Selenium [DEPRECATION] [:browser_options] :options as a parameter for driver initialization is deprecated. Use :capabilities with an Array of value capabilities/options if necessary instead.
13
+
14
+ options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
15
+ options.add_argument("--start-maximized")
16
+ options.add_argument("--no-sandbox")
17
+ options.add_argument("--disable-dev-shm-usage")
18
+ options.add_argument("–-disable-blink-features=AutomationControlled")
19
+ options.add_argument("--disable-extensions")
20
+ options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
21
+ options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
22
+ options.add_preference "password_manager_enabled", false
23
+ options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
24
+
25
+ Capybara.register_driver :selenium_birdsong do |app|
26
+ client = Selenium::WebDriver::Remote::Http::Curb.new
27
+ # client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
28
+ Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
29
+ end
30
+
31
+ Capybara.threadsafe = true
32
+ Capybara.default_max_wait_time = 60
33
+ Capybara.reuse_server = true
34
+
35
+ module Birdsong
36
+ class Scraper # rubocop:disable Metrics/ClassLength
37
+ include Capybara::DSL
38
+
39
+ @@logger = Logger.new(STDOUT)
40
+ @@logger.level = Logger::WARN
41
+ @@logger.datetime_format = "%Y-%m-%d %H:%M:%S"
42
+ @@session_id = nil
43
+
44
+ def initialize
45
+ Capybara.default_driver = :selenium_birdsong
46
+ end
47
+
48
+ # Instagram uses GraphQL (like most of Facebook I think), and returns an object that actually
49
+ # is used to seed the page. We can just parse this for most things.
50
+ #
51
+ # additional_search_params is a comma-separated list of keys
52
+ # example: `data,xdt_api__v1__media__shortcode__web_info,items`
53
+ #
54
+ # @returns Hash a ruby hash of the JSON data
55
+ def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
56
+ # So this is fun:
57
+ # For pages marked as misinformation we have to use one method (interception of requests) and
58
+ # for pages that are not, we can just pull the data straight from the page.
59
+ #
60
+ # How do we figure out which is which?... for now we'll just run through both and see where we
61
+ # go with it.
62
+
63
+ # Our user data no longer lives in the graphql object passed initially with the page.
64
+ # Instead it comes in as part of a subsequent call. This intercepts all calls, checks if it's
65
+ # the one we want, and then moves on.
66
+ response_body = nil
67
+
68
+ page.driver.browser.intercept do |request, &continue|
69
+ # This passes the request forward unmodified, since we only care about the response
70
+ # puts "checking request: #{request.url}"
71
+
72
+ continue.call(request) && next unless request.url.include?(subpage_search)
73
+
74
+
75
+ continue.call(request) do |response|
76
+
77
+ # Check if not a CORS prefetch and finish up if not
78
+ if !response.body.empty? && response.body
79
+ check_passed = true
80
+ unless additional_search_parameters.nil?
81
+ body_to_check = Oj.load(response.body)
82
+
83
+ search_parameters = additional_search_parameters.split(",")
84
+ search_parameters.each_with_index do |key, index|
85
+ break if body_to_check.nil?
86
+
87
+ check_passed = false unless body_to_check.has_key?(key)
88
+ body_to_check = body_to_check[key]
89
+ end
90
+ end
91
+
92
+ response_body = response.body if check_passed == true
93
+ end
94
+ end
95
+ rescue Selenium::WebDriver::Error::WebDriverError
96
+ # Eat them
97
+ end
98
+
99
+ # Now that the intercept is set up, we visit the page we want
100
+ page.driver.browser.navigate.to(url)
101
+ # We wait until the correct intercept is processed or we've waited 60 seconds
102
+ start_time = Time.now
103
+ # puts "Waiting.... #{url}"
104
+
105
+ sleep(rand(1...10))
106
+ while response_body.nil? && (Time.now - start_time) < 60
107
+ sleep(0.1)
108
+ end
109
+
110
+ page.driver.execute_script("window.stop();")
111
+
112
+ raise Birdsong::NoTweetFoundError if response_body.nil?
113
+ Oj.load(response_body)
114
+ end
115
+
116
+ private
117
+
118
+ ##########
119
+ # Set the session to use a new user folder in the options!
120
+ # #####################
121
+ def reset_selenium
122
+ options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
123
+ options.add_argument("--start-maximized")
124
+ options.add_argument("--no-sandbox")
125
+ options.add_argument("--disable-dev-shm-usage")
126
+ options.add_argument("–-disable-blink-features=AutomationControlled")
127
+ options.add_argument("--disable-extensions")
128
+ options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
129
+
130
+ options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
131
+ options.add_preference "password_manager_enabled", false
132
+ options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
133
+ # options.add_argument("--user-data-dir=/tmp/tarun")
134
+
135
+ Capybara.register_driver :selenium do |app|
136
+ client = Selenium::WebDriver::Remote::Http::Curb.new
137
+ # client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
138
+ Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
139
+ end
140
+
141
+ Capybara.current_driver = :selenium
142
+ end
143
+
144
+ # def login
145
+ # # Reset the sessions so that there's nothing laying around
146
+ # page.quit
147
+
148
+ # # Check if we're on a Instagram page already, if not visit it.
149
+ # unless page.driver.browser.current_url.include? "twitter.com"
150
+ # # There seems to be a bug in the Linux ARM64 version of chromedriver where this will properly
151
+ # # navigate but then timeout, crashing it all up. So instead we check and raise the error when
152
+ # # that then fails again.
153
+ # page.driver.browser.navigate.to("https://twitter.com")
154
+ # end
155
+
156
+ # # We don't have to login if we already are
157
+ # begin
158
+ # return if find_field("Search", wait: 10).present?
159
+ # rescue Capybara::ElementNotFound; end
160
+
161
+ # # Check if we're redirected to a login page, if we aren't we're already logged in
162
+ # return unless page.has_xpath?('//*[@id="loginForm"]/div/div[3]/button')
163
+
164
+ # # Try to log in
165
+ # loop_count = 0
166
+ # while loop_count < 5 do
167
+ # fill_in("username", with: ENV["TWITTER_USER_NAME"])
168
+ # fill_in("password", with: ENV["TWITTER_PASSWORD"])
169
+
170
+ # begin
171
+ # click_button("Log in", exact_text: true) # Note: "Log in" (lowercase `in`) instead redirects to Facebook's login page
172
+ # rescue Capybara::ElementNotFound; end # If we can't find it don't break horribly, just keep waiting
173
+
174
+ # break unless has_css?('p[data-testid="login-error-message"', wait: 10)
175
+ # loop_count += 1
176
+ # sleep(rand * 10.3)
177
+ # end
178
+
179
+ # # Sometimes Instagram just... doesn't let you log in
180
+ # raise "Instagram not accessible" if loop_count == 5
181
+
182
+ # # No we don't want to save our login credentials
183
+ # begin
184
+ # click_on("Save Info")
185
+ # rescue Capybara::ElementNotFound; end
186
+ # end
187
+
188
+ def fetch_image(url)
189
+ request = Typhoeus::Request.new(url, followlocation: true)
190
+ request.on_complete do |response|
191
+ if request.success?
192
+ return request.body
193
+ elsif request.timed_out?
194
+ raise Zorki::Error("Fetching image at #{url} timed out")
195
+ else
196
+ raise Zorki::Error("Fetching image at #{url} returned non-successful HTTP server response #{request.code}")
197
+ end
198
+ end
199
+ end
200
+
201
+ # Convert a string to an integer
202
+ def number_string_to_integer(number_string)
203
+ # First we have to remove any commas in the number or else it all breaks
204
+ number_string = number_string.delete(",")
205
+ # Is the last digit not a number? If so, we're going to have to multiply it by some multiplier
206
+ should_expand = /[0-9]/.match(number_string[-1, 1]).nil?
207
+
208
+ # Get the last index and remove the letter at the end if we should expand
209
+ last_index = should_expand ? number_string.length - 1 : number_string.length
210
+ number = number_string[0, last_index].to_f
211
+ multiplier = 1
212
+ # Determine the multiplier depending on the letter indicated
213
+ case number_string[-1, 1]
214
+ when "m"
215
+ multiplier = 1_000_000
216
+ end
217
+
218
+ # Multiply everything and ensure we get an integer back
219
+ (number * multiplier).to_i
220
+ end
221
+ end
222
+ end
223
+
224
+ # require_relative "tweet_scraper"
@@ -0,0 +1,112 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "typhoeus"
4
+ require_relative "scraper"
5
+
6
+ module Birdsong
7
+ class TweetScraper < Scraper
8
+ def parse(id)
9
+ # Stuff we need to get from the DOM (implemented is starred):
10
+ # - User *
11
+ # - Text *
12
+ # - Image * / Images * / Video *
13
+ # - Date *
14
+ # - Number of likes *
15
+ # - Hashtags
16
+
17
+ Capybara.app_host = "https://twitter.com"
18
+
19
+ # video slideshows https://www.instagram.com/p/CY7KxwYOFBS/?utm_source=ig_embed&utm_campaign=loading
20
+ # login
21
+ graphql_object = get_content_of_subpage_from_url(
22
+ "https://twitter.com/jack/status/#{id}",
23
+ "/graphql",
24
+ "data,tweetResult,result"
25
+ )
26
+
27
+ graphql_object = graphql_object.first if graphql_object.kind_of?(Array)
28
+ graphql_object = graphql_object["data"]["tweetResult"]["result"]
29
+
30
+ if graphql_object.key?("__typename") && graphql_object["__typename"] == "TweetUnavailable"
31
+ raise Birdsong::NoTweetFoundError
32
+ end
33
+
34
+ text = graphql_object["legacy"]["full_text"]
35
+ date = graphql_object["legacy"]["created_at"]
36
+ id = graphql_object["legacy"]["id_str"]
37
+ number_of_likes = graphql_object["legacy"]["favorite_count"]
38
+ language = graphql_object["legacy"]["lang"]
39
+
40
+ images = []
41
+ videos = []
42
+ video_preview_image = nil
43
+ video_file_type = nil
44
+
45
+ if graphql_object["legacy"]["entities"].key?("media")
46
+ graphql_object["legacy"]["entities"]["media"].each do |media|
47
+ case media["type"]
48
+ when "photo"
49
+ images << Birdsong.retrieve_media(media["media_url_https"])
50
+ when "video"
51
+ video_preview_image = Birdsong.retrieve_media(media["media_url_https"])
52
+ video_variants = media["video_info"]["variants"]
53
+ largest_bitrate_variant = video_variants.sort_by do |variant|
54
+ variant["bitrate"].nil? ? 0 : variant["bitrate"]
55
+ end.last
56
+
57
+ videos << Birdsong.retrieve_media(largest_bitrate_variant["url"])
58
+ video_file_type = "video"
59
+ when "animated_gif"
60
+ video_preview_image = Birdsong.retrieve_media(media["media_url_https"])
61
+ videos << media["video_info"]["variants"].first["url"]
62
+ video_file_type = "animated_gif"
63
+ end
64
+ end
65
+ end
66
+
67
+ screenshot_file = take_screenshot()
68
+
69
+ # This has to run last since it switches pages
70
+ user_object = graphql_object["core"]["user_results"]["result"]
71
+ user = {
72
+ id: user_object["id"],
73
+ name: user_object["legacy"]["name"],
74
+ username: user_object["legacy"]["screen_name"],
75
+ sign_up_date: user_object["legacy"]["created_at"],
76
+ location: user_object["legacy"]["location"],
77
+ profile_image_url: user_object["legacy"]["profile_image_url_https"],
78
+ description: user_object["legacy"]["description"],
79
+ followers_count: user_object["legacy"]["followers_count"],
80
+ following_count: user_object["legacy"]["friends_count"],
81
+ tweet_count: user_object["legacy"]["statuses_count"],
82
+ listed_count: user_object["legacy"]["listed_count"],
83
+ verified: user_object["legacy"]["verified"],
84
+ url: user_object["legacy"]["url"],
85
+ profile_image_file_name: Birdsong.retrieve_media(user_object["legacy"]["profile_image_url_https"])
86
+ }
87
+
88
+ page.quit
89
+
90
+ {
91
+ images: images,
92
+ video: videos,
93
+ video_preview_image: video_preview_image,
94
+ screenshot_file: screenshot_file,
95
+ text: text,
96
+ date: date,
97
+ number_of_likes: number_of_likes,
98
+ user: user,
99
+ id: id,
100
+ language: language,
101
+ video_file_type: video_file_type
102
+ }
103
+ end
104
+
105
+ def take_screenshot
106
+ # First check if a post has a fact check overlay, if so, clear it.
107
+ # The only issue is that this can take *awhile* to search. Not sure what to do about that
108
+ # since it's Instagram's fault for having such a fucked up obfuscated hierarchy # Take the screenshot and return it
109
+ save_screenshot("#{Birdsong.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
110
+ end
111
+ end
112
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Birdsong
4
- VERSION = "0.2.1"
4
+ VERSION = "0.2.4"
5
5
  end
data/lib/birdsong.rb CHANGED
@@ -10,7 +10,7 @@ require "fileutils"
10
10
  require_relative "birdsong/version"
11
11
  require_relative "birdsong/tweet"
12
12
  require_relative "birdsong/user"
13
- # require_relative "birdsong/scrapers/scraper"
13
+ require_relative "birdsong/scrapers/scraper"
14
14
  require_relative "birdsong/scrapers/tweet_scraper"
15
15
 
16
16
  require_relative "birdsong/monkeypatch"
@@ -23,6 +23,7 @@ module Birdsong
23
23
  class InvalidIdError < Error; end
24
24
  class InvalidMediaTypeError < Error; end
25
25
  class NoTweetFoundError < Error; end
26
+ class WebDriverError < Error; end
26
27
  class RateLimitExceeded < Error
27
28
  attr_reader :rate_limit
28
29
  attr_reader :rate_remaining
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: birdsong
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Christopher Guess
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-10-04 00:00:00.000000000 Z
11
+ date: 2024-04-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: typhoeus
@@ -172,6 +172,9 @@ files:
172
172
  - bin/setup
173
173
  - birdsong.gemspec
174
174
  - lib/birdsong.rb
175
+ - lib/birdsong/monkeypatch.rb
176
+ - lib/birdsong/scrapers/scraper.rb
177
+ - lib/birdsong/scrapers/tweet_scraper.rb
175
178
  - lib/birdsong/tweet.rb
176
179
  - lib/birdsong/user.rb
177
180
  - lib/birdsong/version.rb
@@ -197,7 +200,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
197
200
  - !ruby/object:Gem::Version
198
201
  version: '0'
199
202
  requirements: []
200
- rubygems_version: 3.4.19
203
+ rubygems_version: 3.4.20
201
204
  signing_key:
202
205
  specification_version: 4
203
206
  summary: A gem to interface with Twitter's API V2