forki 0.1.1 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/Gemfile.lock +3 -1
- data/lib/forki/scrapers/post_scraper.rb +56 -20
- data/lib/forki/scrapers/scraper.rb +9 -6
- data/lib/forki/scrapers/user_scraper.rb +8 -3
- data/lib/forki/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a0667614af238aeb8089c1af60794918c0986b7f5e12fa6fe96d33a2a5c1a06f
|
4
|
+
data.tar.gz: 5668a5a6056bdf9bdd9c9fd8f119fa5ed754c5ee0b152d2764b26e9f9d7a8804
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 509ffb30dada2666236ed0900e1def8a61413ee3ba2e8705e31bd01422f8d2baf2cd24479faf906b6374602fcb0c1dc7bb334b7e02a560c798eba6efab30a2c9
|
7
|
+
data.tar.gz: 3f79b804b5505222e06a49352807d92c95f13a8735bedcd1b32cbf079be72b0922052014ef82da8771db108f7b0b674cd132caf7d377084c57cf0d245eb049e1
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
forki (0.1.
|
4
|
+
forki (0.1.1)
|
5
5
|
apparition
|
6
6
|
capybara
|
7
7
|
oj
|
@@ -48,6 +48,7 @@ GEM
|
|
48
48
|
xpath (~> 3.2)
|
49
49
|
concurrent-ruby (1.2.2)
|
50
50
|
crass (1.0.6)
|
51
|
+
curb (1.0.5)
|
51
52
|
dotenv (2.7.6)
|
52
53
|
erubi (1.12.0)
|
53
54
|
ethon (0.16.0)
|
@@ -150,6 +151,7 @@ PLATFORMS
|
|
150
151
|
|
151
152
|
DEPENDENCIES
|
152
153
|
byebug
|
154
|
+
curb (~> 1.0, >= 1.0.5)
|
153
155
|
dotenv (~> 2.7.6)
|
154
156
|
forki!
|
155
157
|
minitest (~> 5.0)
|
@@ -62,8 +62,16 @@ module Forki
|
|
62
62
|
|
63
63
|
def check_if_post_is_image(graphql_objects)
|
64
64
|
graphql_objects.any? do |graphql_object| # if any GraphQL objects contain the top-level keys above, return true
|
65
|
-
true unless graphql_object.fetch("image", nil).nil? # so long as the associated values are not nil
|
66
|
-
true unless graphql_object.fetch("currMedia", nil).nil?
|
65
|
+
return true unless graphql_object.fetch("image", nil).nil? # so long as the associated values are not nil
|
66
|
+
return true unless graphql_object.fetch("currMedia", nil).nil?
|
67
|
+
|
68
|
+
# This is a complicated form for `web.facebook.com` posts
|
69
|
+
|
70
|
+
if !graphql_object.dig("node", "comet_sections", "content", "story", "attachments").nil?
|
71
|
+
if graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].count.positive?
|
72
|
+
return true unless graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].first.dig("styles", "attachment", "all_subattachments", "nodes")&.first&.dig("media", "image", "uri").nil?
|
73
|
+
end
|
74
|
+
end
|
67
75
|
end
|
68
76
|
end
|
69
77
|
|
@@ -220,26 +228,53 @@ module Forki
|
|
220
228
|
|
221
229
|
# Extracts data from an image post by parsing GraphQL strings as seen in the video post scraper above
|
222
230
|
def extract_image_post_data(graphql_object_array)
|
223
|
-
|
224
|
-
|
225
|
-
|
231
|
+
# This is a weird one-off style
|
232
|
+
graphql_object = graphql_object_array.find { |graphql_object| !graphql_object.dig("node", "comet_sections", "content", "story", "attachments").nil? }
|
233
|
+
unless graphql_object.nil? || graphql_object.count == 0
|
234
|
+
attachments = graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"]
|
235
|
+
|
236
|
+
reaction_counts = extract_reaction_counts(graphql_object["node"]["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])
|
237
|
+
id = graphql_object["node"]["post_id"]
|
238
|
+
num_comments = graphql_object["node"]["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]["share_count"]["count"]
|
239
|
+
reshare_warning = graphql_object["node"]["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"]
|
240
|
+
image_url = attachments.first["styles"]["attachment"]["all_subattachments"]["nodes"].first["media"]["image"]["uri"]
|
241
|
+
text = graphql_object["node"]["comet_sections"]["content"]["story"]["message"]["text"]
|
242
|
+
profile_link = graphql_object["node"]["comet_sections"]["content"]["story"]["actors"].first["url"]
|
243
|
+
created_at = graphql_object["node"]["comet_sections"]["content"]["story"]["comet_sections"]["context_layout"]["story"]["comet_sections"]["metadata"].first["story"]["creation_time"]
|
244
|
+
has_video = false
|
245
|
+
else
|
226
246
|
|
227
|
-
|
228
|
-
|
247
|
+
graphql_object_array.find { |graphql_object| graphql_object.key?("viewer_actor") && graphql_object.key?("display_comments") }
|
248
|
+
curr_media_object = graphql_object_array.find { |graphql_object| graphql_object.key?("currMedia") }
|
249
|
+
creation_story_object = graphql_object_array.find { |graphql_object| graphql_object.key?("creation_story") && graphql_object.key?("message") }
|
250
|
+
|
251
|
+
feedback_object = graphql_object_array.find { |graphql_object| graphql_object.has_key?("comet_ufi_summary_and_actions_renderer") }["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
252
|
+
share_count_object = feedback_object.fetch("share_count", {})
|
253
|
+
|
254
|
+
poster = creation_story_object["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]
|
229
255
|
|
230
|
-
|
256
|
+
reaction_counts = extract_reaction_counts(feedback_object["cannot_see_top_custom_reactions"]["top_reactions"])
|
257
|
+
id = curr_media_object["currMedia"]["id"],
|
258
|
+
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["total_comment_count"],
|
259
|
+
num_shares = share_count_object.fetch("count", nil),
|
260
|
+
reshare_warning = feedback_object["should_show_reshare_warning"],
|
261
|
+
image_url = curr_media_object["currMedia"]["image"]["uri"],
|
262
|
+
text = (creation_story_object["message"] || {}).fetch("text", nil),
|
263
|
+
profile_link = poster["url"],
|
264
|
+
created_at = curr_media_object["currMedia"]["created_time"],
|
265
|
+
has_video = false
|
231
266
|
|
232
|
-
|
267
|
+
end
|
233
268
|
post_details = {
|
234
|
-
id:
|
235
|
-
num_comments:
|
236
|
-
num_shares:
|
237
|
-
reshare_warning:
|
238
|
-
image_url:
|
239
|
-
text:
|
240
|
-
profile_link:
|
241
|
-
created_at:
|
242
|
-
has_video:
|
269
|
+
id: id,
|
270
|
+
num_comments: num_comments,
|
271
|
+
num_shares: num_shares,
|
272
|
+
reshare_warning: reshare_warning,
|
273
|
+
image_url: image_url,
|
274
|
+
text: text,
|
275
|
+
profile_link: profile_link,
|
276
|
+
created_at: created_at,
|
277
|
+
has_video: has_video
|
243
278
|
}
|
244
279
|
post_details[:image_file] = Forki.retrieve_media(post_details[:image_url])
|
245
280
|
post_details[:reactions] = reaction_counts
|
@@ -349,8 +384,9 @@ module Forki
|
|
349
384
|
page.quit
|
350
385
|
|
351
386
|
post_data
|
352
|
-
rescue Net::ReadTimeout
|
353
|
-
|
387
|
+
rescue Net::ReadTimeout => e
|
388
|
+
puts "Time out error: #{e}"
|
389
|
+
puts e.backtrace
|
354
390
|
rescue StandardError => e
|
355
391
|
raise e
|
356
392
|
ensure
|
@@ -6,6 +6,7 @@ require "dotenv/load"
|
|
6
6
|
require "oj"
|
7
7
|
require "selenium-webdriver"
|
8
8
|
require "open-uri"
|
9
|
+
require "selenium/webdriver/remote/http/curb"
|
9
10
|
|
10
11
|
options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
|
11
12
|
options.add_argument("--start-maximized")
|
@@ -20,8 +21,8 @@ options.add_argument("--remote-debugging-port=9222")
|
|
20
21
|
options.add_argument("--user-data-dir=/tmp/tarun_forki_#{SecureRandom.uuid}")
|
21
22
|
|
22
23
|
Capybara.register_driver :selenium_forki do |app|
|
23
|
-
client = Selenium::WebDriver::Remote::Http::Default.new
|
24
|
-
client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
|
24
|
+
client = Selenium::WebDriver::Remote::Http::Curb.new
|
25
|
+
# client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
|
25
26
|
Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
|
26
27
|
end
|
27
28
|
|
@@ -97,8 +98,8 @@ module Forki
|
|
97
98
|
options.add_argument("--user-data-dir=/tmp/tarun_forki_#{SecureRandom.uuid}")
|
98
99
|
|
99
100
|
Capybara.register_driver :selenium_forki do |app|
|
100
|
-
client = Selenium::WebDriver::Remote::Http::Default.new
|
101
|
-
client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
|
101
|
+
client = Selenium::WebDriver::Remote::Http::Curb.new
|
102
|
+
# client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
|
102
103
|
Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
|
103
104
|
end
|
104
105
|
|
@@ -110,7 +111,9 @@ module Forki
|
|
110
111
|
raise MissingCredentialsError if ENV["FACEBOOK_EMAIL"].nil? || ENV["FACEBOOK_PASSWORD"].nil?
|
111
112
|
|
112
113
|
url ||= "https://www.facebook.com"
|
113
|
-
|
114
|
+
|
115
|
+
|
116
|
+
page.driver.browser.navigate.to(url) # Visit the url passed in or the facebook homepage if nothing is
|
114
117
|
|
115
118
|
# Look for "login_form" box, which throws an error if not found. So we catch it and run the rest of the tests
|
116
119
|
begin
|
@@ -120,7 +123,7 @@ module Forki
|
|
120
123
|
end
|
121
124
|
|
122
125
|
# Since we're not logged in, let's do that quick
|
123
|
-
|
126
|
+
page.driver.browser.navigate.to("https://www.facebook.com") if login_form.nil?
|
124
127
|
|
125
128
|
login_form.fill_in("email", with: ENV["FACEBOOK_EMAIL"])
|
126
129
|
login_form.fill_in("pass", with: ENV["FACEBOOK_PASSWORD"])
|
@@ -39,9 +39,14 @@ module Forki
|
|
39
39
|
profile_title_section = graphql_strings.find { |gql| gql.include? "profile_tile_section_type" }
|
40
40
|
|
41
41
|
json = JSON.parse(profile_title_section)
|
42
|
-
|
43
|
-
|
44
|
-
|
42
|
+
|
43
|
+
followers_node = []
|
44
|
+
begin
|
45
|
+
followers_node = json["user"]["profile_tile_sections"]["edges"].first["node"]["profile_tile_views"]["nodes"][1]["view_style_renderer"]["view"]["profile_tile_items"]["nodes"].select do |node|
|
46
|
+
node["node"]["timeline_context_item"]["timeline_context_list_item_type"] == "INTRO_CARD_FOLLOWERS"
|
47
|
+
end
|
48
|
+
rescue NoMethodError; end
|
49
|
+
|
45
50
|
if followers_node.empty?
|
46
51
|
number_of_followers = nil
|
47
52
|
else
|
data/lib/forki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: forki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ''
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-05
|
11
|
+
date: 2023-06-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|