birdsong 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 22fda7c7cab000c5a34df61c63e8ee422037ee23a7e7a238a9dff4065bfb6527
-   data.tar.gz: 789744baff3ff5803d99f9c6de95773093bb7199e5c20fd9424faefc87d3bbb9
+   metadata.gz: e225de345219a482d98dc634601169cb2ab42c78cc9105574fb426f34d334980
+   data.tar.gz: 39f26a882f4e5939012fef4f64b30dabe5b28983a8d21fee6723b75a7342f889
  SHA512:
-   metadata.gz: f754cc423c11cf4829f07887567499b0dc537e9755e66351f9106306f8ed9593c43ff351aae297ae0c12c157ac73331157ac2f216b3386c35df8d612d66643c5
-   data.tar.gz: e1d7725cc7c1c9625d2d9b996e8e24a861c83502c6615d72cd686979dbfda969940dd957900eff0d21d6045d3210a9fe2a6354a03d787452a4d0fadcc3250a07
+   metadata.gz: cac7812476ce19901ac91d1701bf8d01772156d1cf1e9e64d4cd17e9a47d1e8cf60e5f8c05b97c387576f0a910a89aeba1aaca9753764107b4cda6f11de97efe
+   data.tar.gz: c50a4320302b0f87ae09be8b445faa52b7935e4177ffa3804de1ade10faad1c4377590750ef3c369d57d977f0e45022db6898dd7a3901426f0cadcfd0007a4e4
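The SHA256 and SHA512 values above are the registry-reported digests of the gem's internal metadata.gz and data.tar.gz archives. As a quick sanity check, here is a minimal Ruby sketch for reproducing them locally, assuming birdsong-0.2.3.gem has already been fetched and unpacked with tar (a .gem file is a plain tar archive containing metadata.gz, data.tar.gz, and checksums.yaml.gz):

    # Hypothetical verification helper; not part of the gem itself.
    require "digest"

    %w[metadata.gz data.tar.gz].each do |file|
      puts "#{file} SHA256: #{Digest::SHA256.file(file).hexdigest}"
      puts "#{file} SHA512: #{Digest::SHA512.file(file).hexdigest}"
    end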
data/lib/birdsong/monkeypatch.rb ADDED
@@ -0,0 +1,52 @@
+ require "logger"
+ require "selenium-webdriver"
+
+ # Design taken from https://blog.appsignal.com/2021/08/24/responsible-monkeypatching-in-ruby.html
+
+ module SeleniumMonkeypatch
+   class << self
+     @@logger = Logger.new(STDOUT)
+     @@logger.level = Logger::INFO
+
+     def apply_patch
+       target_class = find_class
+       target_method = find_method(target_class)
+
+       unless target_method
+         raise "Could not find class or method when patching Selenium::WebDriver::DevTools.send_cmd"
+       end
+
+       @@logger.info "#{__FILE__} is monkeypatching Selenium::WebDriver::DevTools.send_cmd"
+       target_class.prepend(InstanceMethods)
+     end
+
+     private
+
+     def find_class
+       Kernel.const_get("Selenium::WebDriver::DevTools")
+     rescue NameError
+     end
+
+     def find_method(class_)
+       return unless class_
+       class_.instance_method(:send_cmd)
+     rescue NameError
+     end
+   end
+
+   module InstanceMethods
+     # We're monkeypatching the following method so that Selenium doesn't raise errors when we fail to call `continue` on requests
+     def send_cmd(method, **params)
+       data = { method: method, params: params.compact }
+       data[:sessionId] = @session_id if @session_id
+       message = @ws.send_cmd(**data)
+       if !message.nil? && message["error"] && (method != "Fetch.continueRequest")
+         raise Birdsong::Error::WebDriverError, error_message(message["error"])
+       end
+
+       message
+     end
+   end
+ end
+
+ SeleniumMonkeypatch.apply_patch
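The file above follows the prepend-based "responsible monkeypatching" pattern from the linked AppSignal post: the override lives in its own module, the patcher verifies that the target class and method still exist before applying anything, and Module#prepend puts the override ahead of the original in the ancestor chain (here the override re-implements the body rather than calling super, so failed Fetch.continueRequest calls no longer raise). A stripped-down illustration of the same mechanism, using hypothetical class names that are not part of the gem:

    class Greeter
      def greet
        "hello"
      end
    end

    module GreeterPatch
      def greet
        "patched: #{super}"  # runs first because the module is prepended
      end
    end

    Greeter.prepend(GreeterPatch)
    puts Greeter.new.greet  # => "patched: hello"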
data/lib/birdsong/scrapers/scraper.rb ADDED
@@ -0,0 +1,223 @@
+ # frozen_string_literal: true
+
+ require "capybara/dsl"
+ require "dotenv/load"
+ require "oj"
+ require "selenium-webdriver"
+ require "logger"
+ require "securerandom"
+ require "selenium/webdriver/remote/http/curb"
+ require "debug"
+
+ # 2022-06-07 14:15:23 WARN Selenium [DEPRECATION] [:browser_options] :options as a parameter for driver initialization is deprecated. Use :capabilities with an Array of value capabilities/options if necessary instead.
+
+ options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
+ options.add_argument("--start-maximized")
+ options.add_argument("--no-sandbox")
+ options.add_argument("--disable-dev-shm-usage")
+ options.add_argument("--disable-blink-features=AutomationControlled")
+ options.add_argument("--disable-extensions")
+ options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
+ options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
+ options.add_preference "password_manager_enabled", false
+ options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
+
+ Capybara.register_driver :selenium_birdsong do |app|
+   client = Selenium::WebDriver::Remote::Http::Curb.new
+   # client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
+   Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
+ end
+
+ Capybara.threadsafe = true
+ Capybara.default_max_wait_time = 60
+ Capybara.reuse_server = true
+
+ module Birdsong
+   class Scraper # rubocop:disable Metrics/ClassLength
+     include Capybara::DSL
+
+     @@logger = Logger.new(STDOUT)
+     @@logger.level = Logger::WARN
+     @@logger.datetime_format = "%Y-%m-%d %H:%M:%S"
+     @@session_id = nil
+
+     def initialize
+       Capybara.default_driver = :selenium_birdsong
+     end
+
+     # Instagram uses GraphQL (like most of Facebook, I think), and returns an object that actually
+     # is used to seed the page. We can just parse this for most things.
+     #
+     # additional_search_parameters is a comma-separated list of keys,
+     # for example: `data,xdt_api__v1__media__shortcode__web_info,items`
+     #
+     # @return [Hash] a Ruby hash of the JSON data
+     def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
+       # So this is fun:
+       # For pages marked as misinformation we have to use one method (interception of requests) and
+       # for pages that are not, we can just pull the data straight from the page.
+       #
+       # How do we figure out which is which?... for now we'll just run through both and see where we
+       # go with it.
+
+       # Our user data no longer lives in the graphql object passed initially with the page.
+       # Instead it comes in as part of a subsequent call. This intercepts all calls, checks if it's
+       # the one we want, and then moves on.
+       response_body = nil
+
+       page.driver.browser.intercept do |request, &continue|
+         # This passes the request forward unmodified, since we only care about the response
+         # puts "checking request: #{request.url}"
+
+         continue.call(request) && next unless request.url.include?(subpage_search)
+
+
+         continue.call(request) do |response|
+
+           # Skip CORS preflight responses (empty bodies) and finish up otherwise
+           if response.body && !response.body.empty?
+             check_passed = true
+             unless additional_search_parameters.nil?
+               body_to_check = Oj.load(response.body)
+
+               search_parameters = additional_search_parameters.split(",")
+               search_parameters.each do |key|
+                 break if body_to_check.nil?
+
+                 check_passed = false unless body_to_check.has_key?(key)
+                 body_to_check = body_to_check[key]
+               end
+             end
+
+             response_body = response.body if check_passed
+           end
+         end
+       rescue Selenium::WebDriver::Error::WebDriverError
+         # Eat them
+       end
+
+       # Now that the intercept is set up, we visit the page we want
+       page.driver.browser.navigate.to(url)
+       # We wait until the correct intercept is processed or we've waited 60 seconds
+       start_time = Time.now
+       # puts "Waiting.... #{url}"
+
+       sleep(rand(1...10))
+       while response_body.nil? && (Time.now - start_time) < 60
+         sleep(0.1)
+       end
+
+       page.driver.execute_script("window.stop();")
+       raise Birdsong::NoTweetFoundError if response_body.nil?
+       Oj.load(response_body)
+     end
+
+     private
+
+     ##########################
+     # Set the session to use a new user folder in the options!
+     ##########################
+     def reset_selenium
+       options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
+       options.add_argument("--start-maximized")
+       options.add_argument("--no-sandbox")
+       options.add_argument("--disable-dev-shm-usage")
+       options.add_argument("--disable-blink-features=AutomationControlled")
+       options.add_argument("--disable-extensions")
+       options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
+
+       options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
+       options.add_preference "password_manager_enabled", false
+       options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
+       # options.add_argument("--user-data-dir=/tmp/tarun")
+
+       Capybara.register_driver :selenium do |app|
+         client = Selenium::WebDriver::Remote::Http::Curb.new
+         # client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
+         Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
+       end
+
+       Capybara.current_driver = :selenium
+     end
+
+     def login
+       # Reset the sessions so that there's nothing lying around
+       page.quit
+
+       # Check if we're on an Instagram page already; if not, visit it.
+       unless page.driver.browser.current_url.include? "instagram.com"
+         # There seems to be a bug in the Linux ARM64 version of chromedriver where this will properly
+         # navigate but then timeout, crashing it all up. So instead we check and raise the error when
+         # that then fails again.
+         page.driver.browser.navigate.to("https://instagram.com")
+       end
+
+       # We don't have to log in if we already are
+       begin
+         return if find_field("Search", wait: 10).present?
+       rescue Capybara::ElementNotFound; end
+
+       # Check if we're redirected to a login page; if we aren't, we're already logged in
+       return unless page.has_xpath?('//*[@id="loginForm"]/div/div[3]/button')
+
+       # Try to log in
+       loop_count = 0
+       while loop_count < 5
+         fill_in("username", with: ENV["INSTAGRAM_USER_NAME"])
+         fill_in("password", with: ENV["INSTAGRAM_PASSWORD"])
+
+         begin
+           click_button("Log in", exact_text: true) # Note: "Log in" (lowercase `in`) instead redirects to Facebook's login page
+         rescue Capybara::ElementNotFound; end # If we can't find it don't break horribly, just keep waiting
+
+         break unless has_css?('p[data-testid="login-error-message"]', wait: 10)
+         loop_count += 1
+         sleep(rand * 10.3)
+       end
+
+       # Sometimes Instagram just... doesn't let you log in
+       raise "Instagram not accessible" if loop_count == 5
+
+       # No, we don't want to save our login credentials
+       begin
+         click_on("Save Info")
+       rescue Capybara::ElementNotFound; end
+     end
+
+     def fetch_image(url)
+       request = Typhoeus::Request.new(url, followlocation: true)
+       request.on_complete do |response|
+         if response.success?
+           return response.body
+         elsif response.timed_out?
+           raise Birdsong::Error, "Fetching image at #{url} timed out"
+         else
+           raise Birdsong::Error, "Fetching image at #{url} returned non-successful HTTP server response #{response.code}"
+         end
+       end
+       request.run
+     end
+
+     # Convert a string like "1.2m" to an integer
+     def number_string_to_integer(number_string)
+       # First we have to remove any commas in the number or else it all breaks
+       number_string = number_string.delete(",")
+       # Is the last character not a digit? If so, we're going to have to multiply by some multiplier
+       should_expand = /[0-9]/.match(number_string[-1, 1]).nil?
+
+       # Get the last index and remove the letter at the end if we should expand
+       last_index = should_expand ? number_string.length - 1 : number_string.length
+       number = number_string[0, last_index].to_f
+       multiplier = 1
+       # Determine the multiplier depending on the letter indicated
+       case number_string[-1, 1]
+       when "m"
+         multiplier = 1_000_000
+       end
+
+       # Multiply everything and ensure we get an integer back
+       (number * multiplier).to_i
+     end
+   end
+ end
+
+ # require_relative "tweet_scraper"
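The core of this new Scraper class is get_content_of_subpage_from_url, which registers a Chrome DevTools intercept, navigates to the page, and waits up to a minute for a response whose URL matches subpage_search (optionally checking that the JSON body nests the given comma-separated keys) before returning the parsed JSON. A hypothetical sketch of how a subclass might drive it (DemoScraper and the example tweet URL are made up for illustration; the real consumer is the TweetScraper added below):

    class DemoScraper < Birdsong::Scraper
      def grab(url)
        # Wait for a "/graphql" response whose body nests data -> tweetResult -> result,
        # then return the parsed hash.
        get_content_of_subpage_from_url(url, "/graphql", "data,tweetResult,result")
      end
    end

    # DemoScraper.new.grab("https://twitter.com/jack/status/20")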
data/lib/birdsong/scrapers/tweet_scraper.rb ADDED
@@ -0,0 +1,112 @@
+ # frozen_string_literal: true
+
+ require "typhoeus"
+ require_relative "scraper"
+
+ module Birdsong
+   class TweetScraper < Scraper
+     def parse(id)
+       # Stuff we need to get from the DOM (implemented items are starred):
+       # - User *
+       # - Text *
+       # - Image * / Images * / Video *
+       # - Date *
+       # - Number of likes *
+       # - Hashtags
+
+       Capybara.app_host = "https://twitter.com"
+
+       # video slideshows https://www.instagram.com/p/CY7KxwYOFBS/?utm_source=ig_embed&utm_campaign=loading
+       # login
+       graphql_object = get_content_of_subpage_from_url(
+         "https://twitter.com/jack/status/#{id}",
+         "/graphql",
+         "data,tweetResult,result"
+       )
+
+       graphql_object = graphql_object.first if graphql_object.kind_of?(Array)
+       graphql_object = graphql_object["data"]["tweetResult"]["result"]
+
+       if graphql_object.key?("__typename") && graphql_object["__typename"] == "TweetUnavailable"
+         raise Birdsong::NoTweetFoundError
+       end
+
+       text = graphql_object["legacy"]["full_text"]
+       date = graphql_object["legacy"]["created_at"]
+       id = graphql_object["legacy"]["id_str"]
+       number_of_likes = graphql_object["legacy"]["favorite_count"]
+       language = graphql_object["legacy"]["lang"]
+
+       images = []
+       videos = []
+       video_preview_image = nil
+       video_file_type = nil
+
+       if graphql_object["legacy"]["entities"].key?("media")
+         graphql_object["legacy"]["entities"]["media"].each do |media|
+           case media["type"]
+           when "photo"
+             images << Birdsong.retrieve_media(media["media_url_https"])
+           when "video"
+             video_preview_image = Birdsong.retrieve_media(media["media_url_https"])
+             video_variants = media["video_info"]["variants"]
+             largest_bitrate_variant = video_variants.sort_by do |variant|
+               variant["bitrate"].nil? ? 0 : variant["bitrate"]
+             end.last
+
+             videos << Birdsong.retrieve_media(largest_bitrate_variant["url"])
+             video_file_type = "video"
+           when "animated_gif"
+             video_preview_image = Birdsong.retrieve_media(media["media_url_https"])
+             videos << media["video_info"]["variants"].first["url"]
+             video_file_type = "animated_gif"
+           end
+         end
+       end
+
+       screenshot_file = take_screenshot
+
+       # This has to run last since it switches pages
+       user_object = graphql_object["core"]["user_results"]["result"]
+       user = {
+         id: user_object["id"],
+         name: user_object["legacy"]["name"],
+         username: user_object["legacy"]["screen_name"],
+         sign_up_date: user_object["legacy"]["created_at"],
+         location: user_object["legacy"]["location"],
+         profile_image_url: user_object["legacy"]["profile_image_url_https"],
+         description: user_object["legacy"]["description"],
+         followers_count: user_object["legacy"]["followers_count"],
+         following_count: user_object["legacy"]["friends_count"],
+         tweet_count: user_object["legacy"]["statuses_count"],
+         listed_count: user_object["legacy"]["listed_count"],
+         verified: user_object["legacy"]["verified"],
+         url: user_object["legacy"]["url"],
+         profile_image_file_name: Birdsong.retrieve_media(user_object["legacy"]["profile_image_url_https"])
+       }
+
+       page.quit
+
+       {
+         images: images,
+         video: videos,
+         video_preview_image: video_preview_image,
+         screenshot_file: screenshot_file,
+         text: text,
+         date: date,
+         number_of_likes: number_of_likes,
+         user: user,
+         id: id,
+         language: language,
+         video_file_type: video_file_type
+       }
+     end
+
+     def take_screenshot
+       # First check if a post has a fact check overlay and, if so, clear it.
+       # The only issue is that this can take *a while* to search. Not sure what to do about that,
+       # since the page hierarchy is heavily obfuscated. Take the screenshot and return it.
+       save_screenshot("#{Birdsong.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
+     end
+   end
+ end
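Taken together, TweetScraper#parse visits the tweet's status page, waits for the intercepted /graphql response, and returns a plain hash with the tweet's text, date, media, like count, language, and author details. A hedged sketch of what calling it directly might look like (the tweet id is a placeholder, and the gem's public entry point is presumably Birdsong::Tweet rather than the scraper itself):

    require "birdsong"

    result = Birdsong::TweetScraper.new.parse("20")  # "20" is a placeholder tweet id

    puts result[:text]
    puts result[:user][:username]
    puts result[:images].length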
data/lib/birdsong/version.rb CHANGED
@@ -1,5 +1,5 @@
  # frozen_string_literal: true
 
  module Birdsong
-   VERSION = "0.2.1"
+   VERSION = "0.2.3"
  end
data/lib/birdsong.rb CHANGED
@@ -10,7 +10,7 @@ require "fileutils"
  require_relative "birdsong/version"
  require_relative "birdsong/tweet"
  require_relative "birdsong/user"
- # require_relative "birdsong/scrapers/scraper"
+ require_relative "birdsong/scrapers/scraper"
  require_relative "birdsong/scrapers/tweet_scraper"
 
  require_relative "birdsong/monkeypatch"
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: birdsong
  version: !ruby/object:Gem::Version
-   version: 0.2.1
+   version: 0.2.3
  platform: ruby
  authors:
  - Christopher Guess
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-10-04 00:00:00.000000000 Z
+ date: 2023-10-10 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: typhoeus
@@ -172,6 +172,9 @@ files:
  - bin/setup
  - birdsong.gemspec
  - lib/birdsong.rb
+ - lib/birdsong/monkeypatch.rb
+ - lib/birdsong/scrapers/scraper.rb
+ - lib/birdsong/scrapers/tweet_scraper.rb
  - lib/birdsong/tweet.rb
  - lib/birdsong/user.rb
  - lib/birdsong/version.rb