birdsong 0.2.1 → 0.2.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 22fda7c7cab000c5a34df61c63e8ee422037ee23a7e7a238a9dff4065bfb6527
-   data.tar.gz: 789744baff3ff5803d99f9c6de95773093bb7199e5c20fd9424faefc87d3bbb9
+   metadata.gz: e225de345219a482d98dc634601169cb2ab42c78cc9105574fb426f34d334980
+   data.tar.gz: 39f26a882f4e5939012fef4f64b30dabe5b28983a8d21fee6723b75a7342f889
  SHA512:
-   metadata.gz: f754cc423c11cf4829f07887567499b0dc537e9755e66351f9106306f8ed9593c43ff351aae297ae0c12c157ac73331157ac2f216b3386c35df8d612d66643c5
-   data.tar.gz: e1d7725cc7c1c9625d2d9b996e8e24a861c83502c6615d72cd686979dbfda969940dd957900eff0d21d6045d3210a9fe2a6354a03d787452a4d0fadcc3250a07
+   metadata.gz: cac7812476ce19901ac91d1701bf8d01772156d1cf1e9e64d4cd17e9a47d1e8cf60e5f8c05b97c387576f0a910a89aeba1aaca9753764107b4cda6f11de97efe
+   data.tar.gz: c50a4320302b0f87ae09be8b445faa52b7935e4177ffa3804de1ade10faad1c4377590750ef3c369d57d977f0e45022db6898dd7a3901426f0cadcfd0007a4e4
data/lib/birdsong/monkeypatch.rb ADDED
@@ -0,0 +1,52 @@
+ require "logger"
+ require "selenium-webdriver"
+
+ # Design taken from https://blog.appsignal.com/2021/08/24/responsible-monkeypatching-in-ruby.html
+
+ module SeleniumMonkeypatch
+   class << self
+     @@logger = Logger.new(STDOUT)
+     @@logger.level = Logger::INFO
+
+     def apply_patch
+       target_class = find_class
+       target_method = find_method(target_class)
+
+       unless target_method
+         raise "Could not find class or method when patching Selenium::WebDriver::DevTools.send_cmd"
+       end
+
+       @@logger.info "#{__FILE__} is monkeypatching Selenium::WebDriver::DevTools.send_cmd"
+       target_class.prepend(InstanceMethods)
+     end
+
+     private
+
+     def find_class
+       Kernel.const_get("Selenium::WebDriver::DevTools")
+     rescue NameError
+     end
+
+     def find_method(class_)
+       return unless class_
+       class_.instance_method(:send_cmd)
+     rescue NameError
+     end
+   end
+
+   module InstanceMethods
+     # We're monkeypatching the following method so that Selenium doesn't raise errors when we fail to call `continue` on requests
+     def send_cmd(method, **params)
+       data = { method: method, params: params.compact }
+       data[:sessionId] = @session_id if @session_id
+       message = @ws.send_cmd(**data)
+       if !message.nil? && message["error"] && (method != "Fetch.continueRequest")
+         raise Birdsong::Error::WebDriverError, error_message(message["error"])
+       end
+
+       message
+     end
+   end
+ end
+
+ SeleniumMonkeypatch.apply_patch
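
The patch above narrows error handling rather than removing it: only a failed `Fetch.continueRequest` (the CDP command sent when an intercepted request is released after the page has already moved on) is ignored, while every other DevTools error is re-raised as `Birdsong::Error::WebDriverError`. A minimal sketch of the resulting behaviour, assuming a Capybara session configured the way the scraper below sets one up:

require "capybara/dsl"

browser = Capybara.current_session.driver.browser   # underlying Selenium::WebDriver driver

# Releasing a request the browser has already abandoned would normally raise;
# with the patch applied the failed Fetch.continueRequest is silently dropped.
browser.intercept do |request, &continue|
  continue.call(request)
end

# Errors from any other CDP command still raise (as Birdsong::Error::WebDriverError, per the patched method).
browser.devtools.send_cmd("Network.enable")
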
data/lib/birdsong/scrapers/scraper.rb ADDED
@@ -0,0 +1,223 @@
+ # frozen_string_literal: true
+
+ require "capybara/dsl"
+ require "dotenv/load"
+ require "oj"
+ require "selenium-webdriver"
+ require "logger"
+ require "securerandom"
+ require "selenium/webdriver/remote/http/curb"
+ require "debug"
+
+ # 2022-06-07 14:15:23 WARN Selenium [DEPRECATION] [:browser_options] :options as a parameter for driver initialization is deprecated. Use :capabilities with an Array of value capabilities/options if necessary instead.
+
+ options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
+ options.add_argument("--start-maximized")
+ options.add_argument("--no-sandbox")
+ options.add_argument("--disable-dev-shm-usage")
+ options.add_argument("--disable-blink-features=AutomationControlled")
+ options.add_argument("--disable-extensions")
+ options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
+ options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
+ options.add_preference "password_manager_enabled", false
+ options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
+
+ Capybara.register_driver :selenium_birdsong do |app|
+   client = Selenium::WebDriver::Remote::Http::Curb.new
+   # client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
+   Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
+ end
+
+ Capybara.threadsafe = true
+ Capybara.default_max_wait_time = 60
+ Capybara.reuse_server = true
+
+ module Birdsong
+   class Scraper # rubocop:disable Metrics/ClassLength
+     include Capybara::DSL
+
+     @@logger = Logger.new(STDOUT)
+     @@logger.level = Logger::WARN
+     @@logger.datetime_format = "%Y-%m-%d %H:%M:%S"
+     @@session_id = nil
+
+     def initialize
+       Capybara.default_driver = :selenium_birdsong
+     end
+
+     # Instagram uses GraphQL (like most of Facebook I think), and returns an object that actually
+     # is used to seed the page. We can just parse this for most things.
+     #
+     # additional_search_parameters is a comma-separated list of keys
+     # example: `data,xdt_api__v1__media__shortcode__web_info,items`
+     #
+     # @return [Hash] a Ruby hash of the JSON data
+     def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
+       # So this is fun:
+       # For pages marked as misinformation we have to use one method (interception of requests) and
+       # for pages that are not, we can just pull the data straight from the page.
+       #
+       # How do we figure out which is which?... for now we'll just run through both and see where we
+       # go with it.
+
+       # Our user data no longer lives in the graphql object passed initially with the page.
+       # Instead it comes in as part of a subsequent call. This intercepts all calls, checks if it's
+       # the one we want, and then moves on.
+       response_body = nil
+
+       page.driver.browser.intercept do |request, &continue|
+         # This passes the request forward unmodified, since we only care about the response
+         # puts "checking request: #{request.url}"
+
+         continue.call(request) && next unless request.url.include?(subpage_search)
+
+
+         continue.call(request) do |response|
+
+           # Check that this isn't a CORS preflight (empty body) and finish up if it's not
+           if response.body && !response.body.empty?
+             check_passed = true
+             unless additional_search_parameters.nil?
+               body_to_check = Oj.load(response.body)
+
+               search_parameters = additional_search_parameters.split(",")
+               search_parameters.each do |key|
+                 break if body_to_check.nil?
+
+                 check_passed = false unless body_to_check.has_key?(key)
+                 body_to_check = body_to_check[key]
+               end
+             end
+
+             response_body = response.body if check_passed == true
+           end
+         end
+       rescue Selenium::WebDriver::Error::WebDriverError
+         # Eat them
+       end
+
+       # Now that the intercept is set up, we visit the page we want
+       page.driver.browser.navigate.to(url)
+       # We wait until the correct intercept is processed or we've waited 60 seconds
+       start_time = Time.now
+       # puts "Waiting.... #{url}"
+
+       sleep(rand(1...10))
+       while response_body.nil? && (Time.now - start_time) < 60
+         sleep(0.1)
+       end
+
+       page.driver.execute_script("window.stop();")
+       raise Birdsong::NoTweetFoundError if response_body.nil?
+       Oj.load(response_body)
+     end
+
+     private
+
+     ##########
+     # Set the session to use a new user folder in the options!
+     ##########
+     def reset_selenium
+       options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
+       options.add_argument("--start-maximized")
+       options.add_argument("--no-sandbox")
+       options.add_argument("--disable-dev-shm-usage")
+       options.add_argument("--disable-blink-features=AutomationControlled")
+       options.add_argument("--disable-extensions")
+       options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
+
+       options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
+       options.add_preference "password_manager_enabled", false
+       options.add_argument("--user-data-dir=/tmp/tarun_zorki_#{SecureRandom.uuid}")
+       # options.add_argument("--user-data-dir=/tmp/tarun")
+
+       Capybara.register_driver :selenium do |app|
+         client = Selenium::WebDriver::Remote::Http::Curb.new
+         # client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
+         Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
+       end
+
+       Capybara.current_driver = :selenium
+     end
+
+     def login
+       # Reset the sessions so that there's nothing lying around
+       page.quit
+
+       # Check if we're on an Instagram page already, if not, visit it.
+       unless page.driver.browser.current_url.include? "instagram.com"
+         # There seems to be a bug in the Linux ARM64 version of chromedriver where this will properly
+         # navigate but then time out, crashing everything. So instead we check and re-raise the error
+         # when it fails again.
+         page.driver.browser.navigate.to("https://instagram.com")
+       end
+
+       # We don't have to log in if we already are
+       begin
+         return if find_field("Search", wait: 10).present?
+       rescue Capybara::ElementNotFound; end
+
+       # Check if we're redirected to a login page; if we aren't, we're already logged in
+       return unless page.has_xpath?('//*[@id="loginForm"]/div/div[3]/button')
+
+       # Try to log in
+       loop_count = 0
+       while loop_count < 5
+         fill_in("username", with: ENV["INSTAGRAM_USER_NAME"])
+         fill_in("password", with: ENV["INSTAGRAM_PASSWORD"])
+
+         begin
+           click_button("Log in", exact_text: true) # `exact_text` avoids matching "Log in with Facebook", which redirects to Facebook's login page
+         rescue Capybara::ElementNotFound; end # If we can't find it don't break horribly, just keep waiting
+
+         break unless has_css?('p[data-testid="login-error-message"]', wait: 10)
+         loop_count += 1
+         sleep(rand * 10.3)
+       end
+
+       # Sometimes Instagram just... doesn't let you log in
+       raise "Instagram not accessible" if loop_count == 5
+
+       # No, we don't want to save our login credentials
+       begin
+         click_on("Save Info")
+       rescue Capybara::ElementNotFound; end
+     end
+
+     def fetch_image(url)
+       request = Typhoeus::Request.new(url, followlocation: true)
+       request.on_complete do |response|
+         if response.success?
+           return response.body
+         elsif response.timed_out?
+           raise Birdsong::Error, "Fetching image at #{url} timed out"
+         else
+           raise Birdsong::Error, "Fetching image at #{url} returned non-successful HTTP server response #{response.code}"
+         end
+       end
+       request.run
+     end
+
+     # Convert a count string such as "1.5m" into an integer
+     def number_string_to_integer(number_string)
+       # First we have to remove any commas in the number or else it all breaks
+       number_string = number_string.delete(",")
+       # Is the last character not a digit? If so, we're going to have to multiply by some multiplier
+       should_expand = /[0-9]/.match(number_string[-1, 1]).nil?
+
+       # Get the last index and remove the letter at the end if we should expand
+       last_index = should_expand ? number_string.length - 1 : number_string.length
+       number = number_string[0, last_index].to_f
+       multiplier = 1
+       # Determine the multiplier depending on the letter indicated
+       case number_string[-1, 1]
+       when "m"
+         multiplier = 1_000_000
+       end
+
+       # Multiply everything and ensure we get an integer back
+       (number * multiplier).to_i
+     end
+   end
+ end
+
+ # require_relative "tweet_scraper"
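
The heart of the new base class is `get_content_of_subpage_from_url`: it registers a CDP interception callback, navigates to the URL, then polls for up to 60 seconds until a response whose URL contains `subpage_search` (and whose JSON nests the optional comma-separated keys) has been captured. A rough sketch of how a subclass is meant to drive it (the ExampleScraper class and the tweet id are illustrative only, mirroring the TweetScraper below):

class ExampleScraper < Birdsong::Scraper
  def parse(id)
    # Capture the GraphQL response whose URL contains "/graphql" and whose JSON
    # body nests data -> tweetResult -> result.
    json = get_content_of_subpage_from_url(
      "https://twitter.com/jack/status/#{id}",
      "/graphql",
      "data,tweetResult,result"
    )
    json = json.first if json.kind_of?(Array)
    json["data"]["tweetResult"]["result"]
  end
end

Counts scraped as text can be normalized with the private `number_string_to_integer` helper, e.g. "1,234" becomes 1234 and "2.5m" becomes 2500000.
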
data/lib/birdsong/scrapers/tweet_scraper.rb ADDED
@@ -0,0 +1,112 @@
+ # frozen_string_literal: true
+
+ require "typhoeus"
+ require_relative "scraper"
+
+ module Birdsong
+   class TweetScraper < Scraper
+     def parse(id)
+       # Stuff we need to get from the DOM (implemented items are starred):
+       # - User *
+       # - Text *
+       # - Image * / Images * / Video *
+       # - Date *
+       # - Number of likes *
+       # - Hashtags
+
+       Capybara.app_host = "https://twitter.com"
+
+       # video slideshows https://www.instagram.com/p/CY7KxwYOFBS/?utm_source=ig_embed&utm_campaign=loading
+       # login
+       graphql_object = get_content_of_subpage_from_url(
+         "https://twitter.com/jack/status/#{id}",
+         "/graphql",
+         "data,tweetResult,result"
+       )
+
+       graphql_object = graphql_object.first if graphql_object.kind_of?(Array)
+       graphql_object = graphql_object["data"]["tweetResult"]["result"]
+
+       if graphql_object.key?("__typename") && graphql_object["__typename"] == "TweetUnavailable"
+         raise Birdsong::NoTweetFoundError
+       end
+
+       text = graphql_object["legacy"]["full_text"]
+       date = graphql_object["legacy"]["created_at"]
+       id = graphql_object["legacy"]["id_str"]
+       number_of_likes = graphql_object["legacy"]["favorite_count"]
+       language = graphql_object["legacy"]["lang"]
+
+       images = []
+       videos = []
+       video_preview_image = nil
+       video_file_type = nil
+
+       if graphql_object["legacy"]["entities"].key?("media")
+         graphql_object["legacy"]["entities"]["media"].each do |media|
+           case media["type"]
+           when "photo"
+             images << Birdsong.retrieve_media(media["media_url_https"])
+           when "video"
+             video_preview_image = Birdsong.retrieve_media(media["media_url_https"])
+             video_variants = media["video_info"]["variants"]
+             largest_bitrate_variant = video_variants.sort_by do |variant|
+               variant["bitrate"].nil? ? 0 : variant["bitrate"]
+             end.last
+
+             videos << Birdsong.retrieve_media(largest_bitrate_variant["url"])
+             video_file_type = "video"
+           when "animated_gif"
+             video_preview_image = Birdsong.retrieve_media(media["media_url_https"])
+             videos << media["video_info"]["variants"].first["url"]
+             video_file_type = "animated_gif"
+           end
+         end
+       end
+
+       screenshot_file = take_screenshot
+
+       # This has to run last since it switches pages
+       user_object = graphql_object["core"]["user_results"]["result"]
+       user = {
+         id: user_object["id"],
+         name: user_object["legacy"]["name"],
+         username: user_object["legacy"]["screen_name"],
+         sign_up_date: user_object["legacy"]["created_at"],
+         location: user_object["legacy"]["location"],
+         profile_image_url: user_object["legacy"]["profile_image_url_https"],
+         description: user_object["legacy"]["description"],
+         followers_count: user_object["legacy"]["followers_count"],
+         following_count: user_object["legacy"]["friends_count"],
+         tweet_count: user_object["legacy"]["statuses_count"],
+         listed_count: user_object["legacy"]["listed_count"],
+         verified: user_object["legacy"]["verified"],
+         url: user_object["legacy"]["url"],
+         profile_image_file_name: Birdsong.retrieve_media(user_object["legacy"]["profile_image_url_https"])
+       }
+
+       page.quit
+
+       {
+         images: images,
+         video: videos,
+         video_preview_image: video_preview_image,
+         screenshot_file: screenshot_file,
+         text: text,
+         date: date,
+         number_of_likes: number_of_likes,
+         user: user,
+         id: id,
+         language: language,
+         video_file_type: video_file_type
+       }
+     end
+
+     def take_screenshot
+       # First check if a post has a fact check overlay; if so, clear it.
+       # The only issue is that this can take *awhile* to search. Not sure what to do about that
+       # since it's Instagram's fault for having such a fucked up obfuscated hierarchy.
+       # Take the screenshot and return it
+       save_screenshot("#{Birdsong.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
+     end
+   end
+ end
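
A short usage sketch of the new scraper and the keys in the hash `parse` returns (the tweet id and the sample values are illustrative, not fixtures from the gem):

attributes = Birdsong::TweetScraper.new.parse("20")

attributes[:text]            # tweet body
attributes[:date]            # e.g. "Tue Mar 21 20:50:14 +0000 2006"
attributes[:images]          # file names populated via Birdsong.retrieve_media
attributes[:video]           # video file names (or URLs for animated GIFs)
attributes[:user][:username] # e.g. "jack"
attributes[:screenshot_file] # path of the PNG screenshot taken during scraping
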
data/lib/birdsong/version.rb CHANGED
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module Birdsong
-   VERSION = "0.2.1"
+   VERSION = "0.2.3"
  end
data/lib/birdsong.rb CHANGED
@@ -10,7 +10,7 @@ require "fileutils"
  require_relative "birdsong/version"
  require_relative "birdsong/tweet"
  require_relative "birdsong/user"
- # require_relative "birdsong/scrapers/scraper"
+ require_relative "birdsong/scrapers/scraper"
  require_relative "birdsong/scrapers/tweet_scraper"

  require_relative "birdsong/monkeypatch"
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: birdsong
  version: !ruby/object:Gem::Version
- version: 0.2.1
+ version: 0.2.3
  platform: ruby
  authors:
  - Christopher Guess
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-10-04 00:00:00.000000000 Z
+ date: 2023-10-10 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: typhoeus
@@ -172,6 +172,9 @@ files:
  - bin/setup
  - birdsong.gemspec
  - lib/birdsong.rb
+ - lib/birdsong/monkeypatch.rb
+ - lib/birdsong/scrapers/scraper.rb
+ - lib/birdsong/scrapers/tweet_scraper.rb
  - lib/birdsong/tweet.rb
  - lib/birdsong/user.rb
  - lib/birdsong/version.rb