birdsong 0.2.8 → 0.2.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.ruby-version +1 -0
- data/Gemfile.lock +27 -49
- data/birdsong.gemspec +1 -2
- data/lib/birdsong/scrapers/scraper.rb +42 -56
- data/lib/birdsong/scrapers/tweet_scraper.rb +8 -18
- data/lib/birdsong/version.rb +1 -1
- metadata +5 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d9a2c2815cf9cbc7d98e308971c1924f90bba4d2715d7ded7c4e275f523ec84f
|
4
|
+
data.tar.gz: cb888671a2171786b5b7c68c96b4d1e0a8957a1d44ce3a1897b4f0fde43fad8b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 42d22a774b5cc9d1e2518911aaf4e3e7f84e5731d8662103d7e5df48ae71c25c0f723efaf334b60cfb3b18274cebce8119d88765a06f7b6957626506e5abcd74
|
7
|
+
data.tar.gz: 402eafa8cfd7873e5284ded6fce36a050ea8fa5055a240c6fe460e17bd92ea7d55637b8052bff5afd1fc110f66a16f97360c75d5b47249b25d0d974df230ea5b
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
3.0.5
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
birdsong (0.2.
|
4
|
+
birdsong (0.2.7)
|
5
5
|
capybara (~> 3.40)
|
6
6
|
curb (~> 1.0, >= 1.0.5)
|
7
7
|
oauth (~> 0.5.6)
|
@@ -13,24 +13,25 @@ PATH
|
|
13
13
|
GEM
|
14
14
|
remote: https://rubygems.org/
|
15
15
|
specs:
|
16
|
-
activesupport (7.1.
|
16
|
+
activesupport (7.1.5)
|
17
17
|
base64
|
18
|
+
benchmark (>= 0.3)
|
18
19
|
bigdecimal
|
19
20
|
concurrent-ruby (~> 1.0, >= 1.0.2)
|
20
21
|
connection_pool (>= 2.2.5)
|
21
22
|
drb
|
22
23
|
i18n (>= 1.6, < 2)
|
24
|
+
logger (>= 1.4.2)
|
23
25
|
minitest (>= 5.1)
|
24
26
|
mutex_m
|
27
|
+
securerandom (>= 0.3)
|
25
28
|
tzinfo (~> 2.0)
|
26
29
|
addressable (2.8.7)
|
27
30
|
public_suffix (>= 2.0.2, < 7.0)
|
28
31
|
ast (2.4.2)
|
29
|
-
backport (1.2.0)
|
30
32
|
base64 (0.2.0)
|
31
33
|
benchmark (0.3.0)
|
32
34
|
bigdecimal (3.1.8)
|
33
|
-
byebug (11.1.3)
|
34
35
|
capybara (3.40.0)
|
35
36
|
addressable
|
36
37
|
matrix
|
@@ -43,23 +44,20 @@ GEM
|
|
43
44
|
concurrent-ruby (1.3.4)
|
44
45
|
connection_pool (2.4.1)
|
45
46
|
curb (1.0.6)
|
46
|
-
|
47
|
-
|
47
|
+
debug (1.7.0)
|
48
|
+
irb (>= 1.5.0)
|
49
|
+
reline (>= 0.3.1)
|
50
|
+
dotenv (3.1.4)
|
48
51
|
drb (2.2.1)
|
49
|
-
e2mmap (0.1.0)
|
50
52
|
ethon (0.16.0)
|
51
53
|
ffi (>= 1.15.0)
|
52
|
-
ffi (1.17.0
|
53
|
-
ffi (1.17.0-x86_64-darwin)
|
54
|
-
ffi (1.17.0-x86_64-linux-gnu)
|
54
|
+
ffi (1.17.0)
|
55
55
|
i18n (1.14.6)
|
56
56
|
concurrent-ruby (~> 1.0)
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
kramdown-parser-gfm (1.1.0)
|
62
|
-
kramdown (~> 2.0)
|
57
|
+
io-console (0.6.0)
|
58
|
+
irb (1.6.2)
|
59
|
+
reline (>= 0.3.0)
|
60
|
+
json (2.7.5)
|
63
61
|
language_server-protocol (3.17.0.3)
|
64
62
|
logger (1.6.1)
|
65
63
|
matrix (0.4.2)
|
@@ -78,22 +76,21 @@ GEM
|
|
78
76
|
ostruct (>= 0.2)
|
79
77
|
ostruct (0.6.0)
|
80
78
|
parallel (1.26.3)
|
81
|
-
parser (3.3.5.
|
79
|
+
parser (3.3.5.1)
|
82
80
|
ast (~> 2.4.1)
|
83
81
|
racc
|
84
82
|
public_suffix (6.0.1)
|
85
83
|
racc (1.8.1)
|
86
|
-
rack (3.1.
|
84
|
+
rack (3.1.8)
|
87
85
|
rack-test (2.1.0)
|
88
86
|
rack (>= 1.3)
|
89
87
|
rainbow (3.1.1)
|
90
88
|
rake (13.2.1)
|
91
|
-
rbs (2.8.4)
|
92
89
|
regexp_parser (2.9.2)
|
93
|
-
|
94
|
-
|
95
|
-
rexml (3.3.
|
96
|
-
rubocop (1.
|
90
|
+
reline (0.3.2)
|
91
|
+
io-console (~> 0.5)
|
92
|
+
rexml (3.3.9)
|
93
|
+
rubocop (1.68.0)
|
97
94
|
json (~> 2.3)
|
98
95
|
language_server-protocol (>= 3.17.0)
|
99
96
|
parallel (~> 1.10)
|
@@ -103,9 +100,9 @@ GEM
|
|
103
100
|
rubocop-ast (>= 1.32.2, < 2.0)
|
104
101
|
ruby-progressbar (~> 1.7)
|
105
102
|
unicode-display_width (>= 2.4.0, < 3.0)
|
106
|
-
rubocop-ast (1.
|
103
|
+
rubocop-ast (1.33.0)
|
107
104
|
parser (>= 3.3.1.0)
|
108
|
-
rubocop-md (1.2.
|
105
|
+
rubocop-md (1.2.4)
|
109
106
|
rubocop (>= 1.45)
|
110
107
|
rubocop-minitest (0.36.0)
|
111
108
|
rubocop (>= 1.61, < 2.0)
|
@@ -115,7 +112,7 @@ GEM
|
|
115
112
|
rubocop-performance (1.22.1)
|
116
113
|
rubocop (>= 1.48.1, < 2.0)
|
117
114
|
rubocop-ast (>= 1.31.1, < 2.0)
|
118
|
-
rubocop-rails (2.
|
115
|
+
rubocop-rails (2.27.0)
|
119
116
|
activesupport (>= 4.2.0)
|
120
117
|
rack (>= 1.1)
|
121
118
|
rubocop (>= 1.52.0, < 2.0)
|
@@ -130,32 +127,15 @@ GEM
|
|
130
127
|
rubocop-rails (~> 2.0)
|
131
128
|
ruby-progressbar (1.13.0)
|
132
129
|
rubyzip (2.3.2)
|
133
|
-
|
130
|
+
securerandom (0.3.1)
|
131
|
+
selenium-devtools (0.130.0)
|
134
132
|
selenium-webdriver (~> 4.2)
|
135
|
-
selenium-webdriver (4.
|
133
|
+
selenium-webdriver (4.26.0)
|
136
134
|
base64 (~> 0.2)
|
137
135
|
logger (~> 1.4)
|
138
136
|
rexml (~> 3.2, >= 3.2.5)
|
139
137
|
rubyzip (>= 1.2.2, < 3.0)
|
140
138
|
websocket (~> 1.0)
|
141
|
-
solargraph (0.50.0)
|
142
|
-
backport (~> 1.2)
|
143
|
-
benchmark
|
144
|
-
bundler (~> 2.0)
|
145
|
-
diff-lcs (~> 1.4)
|
146
|
-
e2mmap
|
147
|
-
jaro_winkler (~> 1.5)
|
148
|
-
kramdown (~> 2.3)
|
149
|
-
kramdown-parser-gfm (~> 1.1)
|
150
|
-
parser (~> 3.0)
|
151
|
-
rbs (~> 2.0)
|
152
|
-
reverse_markdown (~> 2.0)
|
153
|
-
rubocop (~> 1.38)
|
154
|
-
thor (~> 1.0)
|
155
|
-
tilt (~> 2.0)
|
156
|
-
yard (~> 0.9, >= 0.9.24)
|
157
|
-
thor (1.3.2)
|
158
|
-
tilt (2.4.0)
|
159
139
|
typhoeus (1.4.1)
|
160
140
|
ethon (>= 0.9.0)
|
161
141
|
tzinfo (2.0.6)
|
@@ -164,7 +144,6 @@ GEM
|
|
164
144
|
websocket (1.2.11)
|
165
145
|
xpath (3.2.0)
|
166
146
|
nokogiri (~> 1.8)
|
167
|
-
yard (0.9.37)
|
168
147
|
|
169
148
|
PLATFORMS
|
170
149
|
arm64-darwin-21
|
@@ -175,7 +154,7 @@ PLATFORMS
|
|
175
154
|
|
176
155
|
DEPENDENCIES
|
177
156
|
birdsong!
|
178
|
-
|
157
|
+
debug
|
179
158
|
dotenv
|
180
159
|
minitest
|
181
160
|
rake
|
@@ -183,7 +162,6 @@ DEPENDENCIES
|
|
183
162
|
rubocop-performance
|
184
163
|
rubocop-rails
|
185
164
|
rubocop-rails_config
|
186
|
-
solargraph
|
187
165
|
|
188
166
|
BUNDLED WITH
|
189
167
|
2.3.26
|
data/birdsong.gemspec
CHANGED
@@ -39,7 +39,7 @@ Gem::Specification.new do |spec|
|
|
39
39
|
spec.add_dependency "selenium-devtools"
|
40
40
|
|
41
41
|
# Dev dependencies
|
42
|
-
spec.add_development_dependency "
|
42
|
+
spec.add_development_dependency "debug"
|
43
43
|
spec.add_development_dependency "rake"
|
44
44
|
spec.add_development_dependency "minitest"
|
45
45
|
spec.add_development_dependency "rubocop"
|
@@ -47,7 +47,6 @@ Gem::Specification.new do |spec|
|
|
47
47
|
spec.add_development_dependency "rubocop-rails_config"
|
48
48
|
spec.add_development_dependency "rubocop-performance"
|
49
49
|
spec.add_development_dependency "dotenv"
|
50
|
-
spec.add_development_dependency "solargraph"
|
51
50
|
|
52
51
|
# For more information and examples about making a new gem, checkout our
|
53
52
|
# guide at: https://bundler.io/guides/creating_gem.html
|
@@ -67,11 +67,14 @@ module Birdsong
|
|
67
67
|
|
68
68
|
page.driver.browser.intercept do |request, &continue|
|
69
69
|
# This passes the request forward unmodified, since we only care about the response
|
70
|
+
# puts "checking request: #{request.url}"
|
71
|
+
|
70
72
|
continue.call(request) && next unless request.url.include?(subpage_search)
|
71
73
|
|
74
|
+
|
72
75
|
continue.call(request) do |response|
|
76
|
+
|
73
77
|
# Check if not a CORS prefetch and finish up if not
|
74
|
-
# puts "checking request: #{request.url}"
|
75
78
|
if !response.body.empty? && response.body
|
76
79
|
check_passed = true
|
77
80
|
unless additional_search_parameters.nil?
|
@@ -107,7 +110,6 @@ module Birdsong
|
|
107
110
|
|
108
111
|
page.driver.execute_script("window.stop();")
|
109
112
|
|
110
|
-
# debugger if response_body.nil?
|
111
113
|
raise Birdsong::NoTweetFoundError if response_body.nil?
|
112
114
|
Oj.load(response_body)
|
113
115
|
rescue Birdsong::WebDriverError
|
@@ -141,65 +143,49 @@ module Birdsong
|
|
141
143
|
Capybara.current_driver = :selenium
|
142
144
|
end
|
143
145
|
|
144
|
-
def login
|
145
|
-
|
146
|
-
|
146
|
+
# def login
|
147
|
+
# # Reset the sessions so that there's nothing laying around
|
148
|
+
# page.quit
|
147
149
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
150
|
+
# # Check if we're on a Instagram page already, if not visit it.
|
151
|
+
# unless page.driver.browser.current_url.include? "twitter.com" || page.driver.browser.current_url.include? "x.com"
|
152
|
+
# # There seems to be a bug in the Linux ARM64 version of chromedriver where this will properly
|
153
|
+
# # navigate but then timeout, crashing it all up. So instead we check and raise the error when
|
154
|
+
# # that then fails again.
|
155
|
+
# page.driver.browser.navigate.to("https://x.com")
|
156
|
+
# end
|
155
157
|
|
156
|
-
# We don't have to login if we already are
|
157
|
-
begin
|
158
|
-
|
159
|
-
rescue Capybara::ElementNotFound; end
|
160
|
-
|
161
|
-
page.driver.browser.find_element(link_text: "Sign in").click # Check if we're redirected to a login page, if we aren't we're already logged in
|
158
|
+
# # We don't have to login if we already are
|
159
|
+
# begin
|
160
|
+
# return if find_field("Search", wait: 10).present?
|
161
|
+
# rescue Capybara::ElementNotFound; end
|
162
162
|
|
163
|
+
# # Check if we're redirected to a login page, if we aren't we're already logged in
|
163
164
|
# return unless page.has_xpath?('//*[@id="loginForm"]/div/div[3]/button')
|
164
165
|
|
165
|
-
# Try to log in
|
166
|
-
loop_count = 0
|
167
|
-
while loop_count < 5 do
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
rescue Capybara::ElementNotFound; end # If we can't find it don't break horribly, just keep waiting
|
189
|
-
|
190
|
-
break unless has_css?('p[data-testid="login-error-message"', wait: 10)
|
191
|
-
loop_count += 1
|
192
|
-
sleep(rand * 10.3)
|
193
|
-
end
|
194
|
-
|
195
|
-
# Sometimes Twitter just... doesn't let you log in
|
196
|
-
raise "Twitter not accessible" if loop_count == 5
|
197
|
-
|
198
|
-
# No we don't want to save our login credentials
|
199
|
-
begin
|
200
|
-
click_on("Save Info")
|
201
|
-
rescue Capybara::ElementNotFound; end
|
202
|
-
end
|
166
|
+
# # Try to log in
|
167
|
+
# loop_count = 0
|
168
|
+
# while loop_count < 5 do
|
169
|
+
# fill_in("username", with: ENV["TWITTER_USER_NAME"])
|
170
|
+
# fill_in("password", with: ENV["TWITTER_PASSWORD"])
|
171
|
+
|
172
|
+
# begin
|
173
|
+
# click_button("Log in", exact_text: true) # Note: "Log in" (lowercase `in`) instead redirects to Facebook's login page
|
174
|
+
# rescue Capybara::ElementNotFound; end # If we can't find it don't break horribly, just keep waiting
|
175
|
+
|
176
|
+
# break unless has_css?('p[data-testid="login-error-message"', wait: 10)
|
177
|
+
# loop_count += 1
|
178
|
+
# sleep(rand * 10.3)
|
179
|
+
# end
|
180
|
+
|
181
|
+
# # Sometimes Instagram just... doesn't let you log in
|
182
|
+
# raise "Instagram not accessible" if loop_count == 5
|
183
|
+
|
184
|
+
# # No we don't want to save our login credentials
|
185
|
+
# begin
|
186
|
+
# click_on("Save Info")
|
187
|
+
# rescue Capybara::ElementNotFound; end
|
188
|
+
# end
|
203
189
|
|
204
190
|
def fetch_image(url)
|
205
191
|
request = Typhoeus::Request.new(url, followlocation: true)
|
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require "typhoeus"
|
4
4
|
require_relative "scraper"
|
5
|
+
require "debug"
|
5
6
|
|
6
7
|
module Birdsong
|
7
8
|
class TweetScraper < Scraper
|
@@ -20,31 +21,20 @@ module Birdsong
|
|
20
21
|
# login
|
21
22
|
graphql_object = get_content_of_subpage_from_url(
|
22
23
|
"https://x.com/jack/status/#{id}",
|
23
|
-
"/
|
24
|
+
"/graphql",
|
24
25
|
"data,tweetResult,result"
|
25
26
|
)
|
26
27
|
|
27
28
|
graphql_object = graphql_object.first if graphql_object.kind_of?(Array)
|
28
29
|
graphql_object = graphql_object["data"]["tweetResult"]["result"]
|
29
30
|
|
30
|
-
raise Birdsong::NoTweetFoundError if graphql_object.nil?
|
31
|
-
|
32
31
|
if graphql_object.key?("__typename") && graphql_object["__typename"] == "TweetUnavailable"
|
33
|
-
raise Birdsong::NoTweetFoundError
|
34
|
-
@@logger.info "Post is tagged NSFW, logging in to access..."
|
35
|
-
# Let's login and start this over?
|
36
|
-
login
|
37
|
-
@@logger.info "Logged in, retrying post..."
|
38
|
-
|
39
|
-
graphql_object = get_content_of_subpage_from_url(
|
40
|
-
"https://x.com/jack/status/#{id}",
|
41
|
-
"/TweetDetail"
|
42
|
-
)
|
43
|
-
|
44
|
-
# The format gets weird for this request
|
45
|
-
graphql_object = graphql_object["data"]["threaded_conversation_with_injections_v2"]["instructions"][0]["entries"][0]["content"]["itemContent"]["tweet_results"]["result"]["tweet"]
|
32
|
+
raise Birdsong::NoTweetFoundError
|
46
33
|
end
|
47
34
|
|
35
|
+
# Certain types of tweets are wrapped in a "tweet" object
|
36
|
+
graphql_object = graphql_object["tweet"] if graphql_object.key?("tweet")
|
37
|
+
|
48
38
|
text = graphql_object["legacy"]["full_text"]
|
49
39
|
date = graphql_object["legacy"]["created_at"]
|
50
40
|
id = graphql_object["legacy"]["id_str"]
|
@@ -65,7 +55,7 @@ module Birdsong
|
|
65
55
|
video_preview_image = Birdsong.retrieve_media(media["media_url_https"])
|
66
56
|
video_variants = media["video_info"]["variants"]
|
67
57
|
largest_bitrate_variant = video_variants.sort_by do |variant|
|
68
|
-
variant
|
58
|
+
variant["bitrate"].nil? ? 0 : variant["bitrate"]
|
69
59
|
end.last
|
70
60
|
|
71
61
|
videos << Birdsong.retrieve_media(largest_bitrate_variant["url"])
|
@@ -122,7 +112,7 @@ module Birdsong
|
|
122
112
|
# since it's Instagram's fault for having such a fucked up obfuscated hierarchy # Take the screenshot and return it
|
123
113
|
# rubocop:disable Lint/Debugger
|
124
114
|
save_screenshot("#{Birdsong.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
|
125
|
-
# rubocop:enable
|
115
|
+
# rubocop:enable Link/Debugger
|
126
116
|
end
|
127
117
|
end
|
128
118
|
end
|
data/lib/birdsong/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: birdsong
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christopher Guess
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-11-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: typhoeus
|
@@ -127,7 +127,7 @@ dependencies:
|
|
127
127
|
- !ruby/object:Gem::Version
|
128
128
|
version: '0'
|
129
129
|
- !ruby/object:Gem::Dependency
|
130
|
-
name:
|
130
|
+
name: debug
|
131
131
|
requirement: !ruby/object:Gem::Requirement
|
132
132
|
requirements:
|
133
133
|
- - ">="
|
@@ -238,20 +238,6 @@ dependencies:
|
|
238
238
|
- - ">="
|
239
239
|
- !ruby/object:Gem::Version
|
240
240
|
version: '0'
|
241
|
-
- !ruby/object:Gem::Dependency
|
242
|
-
name: solargraph
|
243
|
-
requirement: !ruby/object:Gem::Requirement
|
244
|
-
requirements:
|
245
|
-
- - ">="
|
246
|
-
- !ruby/object:Gem::Version
|
247
|
-
version: '0'
|
248
|
-
type: :development
|
249
|
-
prerelease: false
|
250
|
-
version_requirements: !ruby/object:Gem::Requirement
|
251
|
-
requirements:
|
252
|
-
- - ">="
|
253
|
-
- !ruby/object:Gem::Version
|
254
|
-
version: '0'
|
255
241
|
description:
|
256
242
|
email:
|
257
243
|
- cguess@gmail.com
|
@@ -262,6 +248,7 @@ files:
|
|
262
248
|
- ".github/workflows/main.yml"
|
263
249
|
- ".gitignore"
|
264
250
|
- ".rubocop.yml"
|
251
|
+
- ".ruby-version"
|
265
252
|
- CHANGELOG.md
|
266
253
|
- CODE_OF_CONDUCT.md
|
267
254
|
- Gemfile
|
@@ -301,7 +288,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
301
288
|
- !ruby/object:Gem::Version
|
302
289
|
version: '0'
|
303
290
|
requirements: []
|
304
|
-
rubygems_version: 3.
|
291
|
+
rubygems_version: 3.2.33
|
305
292
|
signing_key:
|
306
293
|
specification_version: 4
|
307
294
|
summary: A gem to interface with Twitter's API V2
|