forki 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/Gemfile.lock +3 -1
- data/lib/forki/scrapers/post_scraper.rb +56 -20
- data/lib/forki/scrapers/scraper.rb +9 -6
- data/lib/forki/scrapers/user_scraper.rb +8 -3
- data/lib/forki/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a0667614af238aeb8089c1af60794918c0986b7f5e12fa6fe96d33a2a5c1a06f
|
4
|
+
data.tar.gz: 5668a5a6056bdf9bdd9c9fd8f119fa5ed754c5ee0b152d2764b26e9f9d7a8804
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 509ffb30dada2666236ed0900e1def8a61413ee3ba2e8705e31bd01422f8d2baf2cd24479faf906b6374602fcb0c1dc7bb334b7e02a560c798eba6efab30a2c9
|
7
|
+
data.tar.gz: 3f79b804b5505222e06a49352807d92c95f13a8735bedcd1b32cbf079be72b0922052014ef82da8771db108f7b0b674cd132caf7d377084c57cf0d245eb049e1
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
forki (0.1.
|
4
|
+
forki (0.1.1)
|
5
5
|
apparition
|
6
6
|
capybara
|
7
7
|
oj
|
@@ -48,6 +48,7 @@ GEM
|
|
48
48
|
xpath (~> 3.2)
|
49
49
|
concurrent-ruby (1.2.2)
|
50
50
|
crass (1.0.6)
|
51
|
+
curb (1.0.5)
|
51
52
|
dotenv (2.7.6)
|
52
53
|
erubi (1.12.0)
|
53
54
|
ethon (0.16.0)
|
@@ -150,6 +151,7 @@ PLATFORMS
|
|
150
151
|
|
151
152
|
DEPENDENCIES
|
152
153
|
byebug
|
154
|
+
curb (~> 1.0, >= 1.0.5)
|
153
155
|
dotenv (~> 2.7.6)
|
154
156
|
forki!
|
155
157
|
minitest (~> 5.0)
|
@@ -62,8 +62,16 @@ module Forki
|
|
62
62
|
|
63
63
|
def check_if_post_is_image(graphql_objects)
|
64
64
|
graphql_objects.any? do |graphql_object| # if any GraphQL objects contain the top-level keys above, return true
|
65
|
-
true unless graphql_object.fetch("image", nil).nil? # so long as the associated values are not nil
|
66
|
-
true unless graphql_object.fetch("currMedia", nil).nil?
|
65
|
+
return true unless graphql_object.fetch("image", nil).nil? # so long as the associated values are not nil
|
66
|
+
return true unless graphql_object.fetch("currMedia", nil).nil?
|
67
|
+
|
68
|
+
# This is a complicated form for `web.facebook.com` posts
|
69
|
+
|
70
|
+
if !graphql_object.dig("node", "comet_sections", "content", "story", "attachments").nil?
|
71
|
+
if graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].count.positive?
|
72
|
+
return true unless graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].first.dig("styles", "attachment", "all_subattachments", "nodes")&.first&.dig("media", "image", "uri").nil?
|
73
|
+
end
|
74
|
+
end
|
67
75
|
end
|
68
76
|
end
|
69
77
|
|
@@ -220,26 +228,53 @@ module Forki
|
|
220
228
|
|
221
229
|
# Extracts data from an image post by parsing GraphQL strings as seen in the video post scraper above
|
222
230
|
def extract_image_post_data(graphql_object_array)
|
223
|
-
|
224
|
-
|
225
|
-
|
231
|
+
# This is a weird one-off style
|
232
|
+
graphql_object = graphql_object_array.find { |graphql_object| !graphql_object.dig("node", "comet_sections", "content", "story", "attachments").nil? }
|
233
|
+
unless graphql_object.nil? || graphql_object.count == 0
|
234
|
+
attachments = graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"]
|
235
|
+
|
236
|
+
reaction_counts = extract_reaction_counts(graphql_object["node"]["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])
|
237
|
+
id = graphql_object["node"]["post_id"]
|
238
|
+
num_comments = graphql_object["node"]["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]["share_count"]["count"]
|
239
|
+
reshare_warning = graphql_object["node"]["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"]
|
240
|
+
image_url = attachments.first["styles"]["attachment"]["all_subattachments"]["nodes"].first["media"]["image"]["uri"]
|
241
|
+
text = graphql_object["node"]["comet_sections"]["content"]["story"]["message"]["text"]
|
242
|
+
profile_link = graphql_object["node"]["comet_sections"]["content"]["story"]["actors"].first["url"]
|
243
|
+
created_at = graphql_object["node"]["comet_sections"]["content"]["story"]["comet_sections"]["context_layout"]["story"]["comet_sections"]["metadata"].first["story"]["creation_time"]
|
244
|
+
has_video = false
|
245
|
+
else
|
226
246
|
|
227
|
-
|
228
|
-
|
247
|
+
graphql_object_array.find { |graphql_object| graphql_object.key?("viewer_actor") && graphql_object.key?("display_comments") }
|
248
|
+
curr_media_object = graphql_object_array.find { |graphql_object| graphql_object.key?("currMedia") }
|
249
|
+
creation_story_object = graphql_object_array.find { |graphql_object| graphql_object.key?("creation_story") && graphql_object.key?("message") }
|
250
|
+
|
251
|
+
feedback_object = graphql_object_array.find { |graphql_object| graphql_object.has_key?("comet_ufi_summary_and_actions_renderer") }["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
252
|
+
share_count_object = feedback_object.fetch("share_count", {})
|
253
|
+
|
254
|
+
poster = creation_story_object["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]
|
229
255
|
|
230
|
-
|
256
|
+
reaction_counts = extract_reaction_counts(feedback_object["cannot_see_top_custom_reactions"]["top_reactions"])
|
257
|
+
id = curr_media_object["currMedia"]["id"],
|
258
|
+
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["total_comment_count"],
|
259
|
+
num_shares = share_count_object.fetch("count", nil),
|
260
|
+
reshare_warning = feedback_object["should_show_reshare_warning"],
|
261
|
+
image_url = curr_media_object["currMedia"]["image"]["uri"],
|
262
|
+
text = (creation_story_object["message"] || {}).fetch("text", nil),
|
263
|
+
profile_link = poster["url"],
|
264
|
+
created_at = curr_media_object["currMedia"]["created_time"],
|
265
|
+
has_video = false
|
231
266
|
|
232
|
-
|
267
|
+
end
|
233
268
|
post_details = {
|
234
|
-
id:
|
235
|
-
num_comments:
|
236
|
-
num_shares:
|
237
|
-
reshare_warning:
|
238
|
-
image_url:
|
239
|
-
text:
|
240
|
-
profile_link:
|
241
|
-
created_at:
|
242
|
-
has_video:
|
269
|
+
id: id,
|
270
|
+
num_comments: num_comments,
|
271
|
+
num_shares: num_shares,
|
272
|
+
reshare_warning: reshare_warning,
|
273
|
+
image_url: image_url,
|
274
|
+
text: text,
|
275
|
+
profile_link: profile_link,
|
276
|
+
created_at: created_at,
|
277
|
+
has_video: has_video
|
243
278
|
}
|
244
279
|
post_details[:image_file] = Forki.retrieve_media(post_details[:image_url])
|
245
280
|
post_details[:reactions] = reaction_counts
|
@@ -349,8 +384,9 @@ module Forki
|
|
349
384
|
page.quit
|
350
385
|
|
351
386
|
post_data
|
352
|
-
rescue Net::ReadTimeout
|
353
|
-
|
387
|
+
rescue Net::ReadTimeout => e
|
388
|
+
puts "Time out error: #{e}"
|
389
|
+
puts e.backtrace
|
354
390
|
rescue StandardError => e
|
355
391
|
raise e
|
356
392
|
ensure
|
@@ -6,6 +6,7 @@ require "dotenv/load"
|
|
6
6
|
require "oj"
|
7
7
|
require "selenium-webdriver"
|
8
8
|
require "open-uri"
|
9
|
+
require "selenium/webdriver/remote/http/curb"
|
9
10
|
|
10
11
|
options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
|
11
12
|
options.add_argument("--start-maximized")
|
@@ -20,8 +21,8 @@ options.add_argument("--remote-debugging-port=9222")
|
|
20
21
|
options.add_argument("--user-data-dir=/tmp/tarun_forki_#{SecureRandom.uuid}")
|
21
22
|
|
22
23
|
Capybara.register_driver :selenium_forki do |app|
|
23
|
-
client = Selenium::WebDriver::Remote::Http::
|
24
|
-
client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
|
24
|
+
client = Selenium::WebDriver::Remote::Http::Curb.new
|
25
|
+
# client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
|
25
26
|
Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
|
26
27
|
end
|
27
28
|
|
@@ -97,8 +98,8 @@ module Forki
|
|
97
98
|
options.add_argument("--user-data-dir=/tmp/tarun_forki_#{SecureRandom.uuid}")
|
98
99
|
|
99
100
|
Capybara.register_driver :selenium_forki do |app|
|
100
|
-
client = Selenium::WebDriver::Remote::Http::
|
101
|
-
client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
|
101
|
+
client = Selenium::WebDriver::Remote::Http::Curb.new
|
102
|
+
# client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
|
102
103
|
Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
|
103
104
|
end
|
104
105
|
|
@@ -110,7 +111,9 @@ module Forki
|
|
110
111
|
raise MissingCredentialsError if ENV["FACEBOOK_EMAIL"].nil? || ENV["FACEBOOK_PASSWORD"].nil?
|
111
112
|
|
112
113
|
url ||= "https://www.facebook.com"
|
113
|
-
|
114
|
+
|
115
|
+
|
116
|
+
page.driver.browser.navigate.to(url) # Visit the url passed in or the facebook homepage if nothing is
|
114
117
|
|
115
118
|
# Look for "login_form" box, which throws an error if not found. So we catch it and run the rest of the tests
|
116
119
|
begin
|
@@ -120,7 +123,7 @@ module Forki
|
|
120
123
|
end
|
121
124
|
|
122
125
|
# Since we're not logged in, let's do that quick
|
123
|
-
|
126
|
+
page.driver.browser.navigate.to("https://www.facebook.com") if login_form.nil?
|
124
127
|
|
125
128
|
login_form.fill_in("email", with: ENV["FACEBOOK_EMAIL"])
|
126
129
|
login_form.fill_in("pass", with: ENV["FACEBOOK_PASSWORD"])
|
@@ -39,9 +39,14 @@ module Forki
|
|
39
39
|
profile_title_section = graphql_strings.find { |gql| gql.include? "profile_tile_section_type" }
|
40
40
|
|
41
41
|
json = JSON.parse(profile_title_section)
|
42
|
-
|
43
|
-
|
44
|
-
|
42
|
+
|
43
|
+
followers_node = []
|
44
|
+
begin
|
45
|
+
followers_node = json["user"]["profile_tile_sections"]["edges"].first["node"]["profile_tile_views"]["nodes"][1]["view_style_renderer"]["view"]["profile_tile_items"]["nodes"].select do |node|
|
46
|
+
node["node"]["timeline_context_item"]["timeline_context_list_item_type"] == "INTRO_CARD_FOLLOWERS"
|
47
|
+
end
|
48
|
+
rescue NoMethodError; end
|
49
|
+
|
45
50
|
if followers_node.empty?
|
46
51
|
number_of_followers = nil
|
47
52
|
else
|
data/lib/forki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: forki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ''
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-05
|
11
|
+
date: 2023-06-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|