forki 0.2.10 → 0.2.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +21 -7
- data/forki.gemspec +1 -0
- data/lib/forki/scrapers/post_scraper.rb +12 -2
- data/lib/forki/scrapers/scraper.rb +2 -2
- data/lib/forki/scrapers/user_scraper.rb +43 -1
- data/lib/forki/user.rb +1 -1
- data/lib/forki/version.rb +1 -1
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5c0373d5017b66032feca78ff34014da595078bd2ea276aed8590199ba386397
|
4
|
+
data.tar.gz: 41f53e6c34b490d9cf7f24ad6ff43c82c8e596c789938c1a8026411a5111245b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c9cb10df9e69bb07c1b8f2d1da2e3369ddfaddf1900ad94e5d9568c50f4045df994939f81a9d7543371ee53b8376f93d2e2e62370a69ccffb598b83362e85ac2
|
7
|
+
data.tar.gz: f553f6ad7d90aece2501f1ef6d0cd8c787f1ce8bc051480bdc87dab8fa43721923670b76aa3a3b1b8a6fbc46dd834dde2025ffd79dc00987ba9bcd6899bd99f4
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
forki (0.2.
|
4
|
+
forki (0.2.11)
|
5
5
|
apparition
|
6
6
|
capybara
|
7
7
|
oj
|
8
8
|
selenium-webdriver
|
9
9
|
typhoeus
|
10
|
+
zorki
|
10
11
|
|
11
12
|
GEM
|
12
13
|
remote: https://rubygems.org/
|
@@ -29,8 +30,8 @@ GEM
|
|
29
30
|
i18n (>= 1.6, < 2)
|
30
31
|
minitest (>= 5.1)
|
31
32
|
tzinfo (~> 2.0)
|
32
|
-
addressable (2.8.
|
33
|
-
public_suffix (>= 2.0.2, <
|
33
|
+
addressable (2.8.7)
|
34
|
+
public_suffix (>= 2.0.2, < 7.0)
|
34
35
|
apparition (0.6.0)
|
35
36
|
capybara (~> 3.13, < 4)
|
36
37
|
websocket-driver (>= 0.6.5)
|
@@ -59,6 +60,7 @@ GEM
|
|
59
60
|
i18n (1.13.0)
|
60
61
|
concurrent-ruby (~> 1.0)
|
61
62
|
json (2.6.3)
|
63
|
+
logger (1.6.1)
|
62
64
|
loofah (2.21.3)
|
63
65
|
crass (~> 1.0.2)
|
64
66
|
nokogiri (>= 1.12.0)
|
@@ -68,12 +70,14 @@ GEM
|
|
68
70
|
minitest (5.18.0)
|
69
71
|
nokogiri (1.15.1-arm64-darwin)
|
70
72
|
racc (~> 1.4)
|
71
|
-
oj (3.16.
|
73
|
+
oj (3.16.6)
|
72
74
|
bigdecimal (>= 3.0)
|
75
|
+
ostruct (>= 0.2)
|
76
|
+
ostruct (0.6.0)
|
73
77
|
parallel (1.23.0)
|
74
78
|
parser (3.2.2.1)
|
75
79
|
ast (~> 2.4.1)
|
76
|
-
public_suffix (
|
80
|
+
public_suffix (6.0.1)
|
77
81
|
racc (1.6.2)
|
78
82
|
rack (2.2.4)
|
79
83
|
rack-test (2.1.0)
|
@@ -130,8 +134,11 @@ GEM
|
|
130
134
|
rubocop-rails (~> 2.0)
|
131
135
|
ruby-progressbar (1.13.0)
|
132
136
|
rubyzip (2.3.2)
|
133
|
-
selenium-
|
137
|
+
selenium-devtools (0.129.0)
|
138
|
+
selenium-webdriver (~> 4.2)
|
139
|
+
selenium-webdriver (4.24.0)
|
134
140
|
base64 (~> 0.2)
|
141
|
+
logger (~> 1.4)
|
135
142
|
rexml (~> 3.2, >= 3.2.5)
|
136
143
|
rubyzip (>= 1.2.2, < 3.0)
|
137
144
|
websocket (~> 1.0)
|
@@ -141,13 +148,20 @@ GEM
|
|
141
148
|
tzinfo (2.0.6)
|
142
149
|
concurrent-ruby (~> 1.0)
|
143
150
|
unicode-display_width (2.4.2)
|
144
|
-
websocket (1.2.
|
151
|
+
websocket (1.2.11)
|
145
152
|
websocket-driver (0.7.6)
|
146
153
|
websocket-extensions (>= 0.1.0)
|
147
154
|
websocket-extensions (0.1.5)
|
148
155
|
xpath (3.2.0)
|
149
156
|
nokogiri (~> 1.8)
|
150
157
|
zeitwerk (2.6.8)
|
158
|
+
zorki (0.2.6)
|
159
|
+
apparition
|
160
|
+
capybara
|
161
|
+
oj
|
162
|
+
selenium-devtools
|
163
|
+
selenium-webdriver (~> 4.24.0)
|
164
|
+
typhoeus
|
151
165
|
|
152
166
|
PLATFORMS
|
153
167
|
arm64-darwin-21
|
data/forki.gemspec
CHANGED
@@ -36,6 +36,7 @@ Gem::Specification.new do |spec|
|
|
36
36
|
spec.add_dependency "typhoeus" # For making API requests
|
37
37
|
spec.add_dependency "oj" # A faster JSON parser/loader than stdlib
|
38
38
|
spec.add_dependency "selenium-webdriver" # Webdriver selenium
|
39
|
+
spec.add_dependency "zorki" # For scraping instagram pages
|
39
40
|
|
40
41
|
spec.add_development_dependency "thor" # For the generator
|
41
42
|
|
data/lib/forki/scrapers/post_scraper.rb
CHANGED
@@ -505,7 +505,11 @@ module Forki
|
|
505
505
|
# Do nothing if element not found
|
506
506
|
end
|
507
507
|
|
508
|
-
|
508
|
+
begin
|
509
|
+
save_screenshot("#{Forki.temp_storage_location}/facebook_screenshot_#{SecureRandom.uuid}.png")
|
510
|
+
rescue Selenium::WebDriver::Error::TimeoutError
|
511
|
+
raise Net::ReadTimeout
|
512
|
+
end
|
509
513
|
end
|
510
514
|
|
511
515
|
# Uses GraphQL data and DOM elements to collect information about the current post
|
@@ -536,14 +540,20 @@ module Forki
|
|
536
540
|
rescue Net::ReadTimeout => e
|
537
541
|
puts "Time out error: #{e}"
|
538
542
|
puts e.backtrace
|
543
|
+
raise Forki::RetryableError # This insures it'll eventually be retried by Hypatia
|
539
544
|
rescue StandardError => e
|
540
545
|
raise e
|
546
|
+
raise Forki::RetryableError
|
541
547
|
ensure
|
542
548
|
# `page` here can be broken already. In which case we want to raise an error so it's retried later
|
543
549
|
begin
|
544
550
|
page.quit
|
545
551
|
rescue Curl::Err::ConnectionFailedError
|
546
|
-
raise Forki::
|
552
|
+
raise Forki::RetryableError # This insures it'll eventually be retried by Hypatia
|
553
|
+
rescue StandardError => e
|
554
|
+
puts "Error closing browser: #{e}"
|
555
|
+
raise e
|
556
|
+
# raise Forki::RetryableError
|
547
557
|
end
|
548
558
|
end
|
549
559
|
end
|
data/lib/forki/scrapers/scraper.rb
CHANGED
@@ -178,10 +178,10 @@ module Forki
|
|
178
178
|
# If either of those two conditions are false, raises an exception
|
179
179
|
def validate_and_load_page(url)
|
180
180
|
Capybara.app_host = "https://www.facebook.com"
|
181
|
-
facebook_hosts = ["facebook.com", "www.facebook.com", "web.facebook.com", "m.facebook.com"]
|
181
|
+
facebook_hosts = ["facebook.com", "www.facebook.com", "web.facebook.com", "m.facebook.com", "l.facebook.com"]
|
182
182
|
parsed_url = URI.parse(url)
|
183
183
|
host = parsed_url.host
|
184
|
-
raise Forki::InvalidUrlError unless facebook_hosts.include?(host)
|
184
|
+
raise Forki::InvalidUrlError.new("Invalid Facebook host: #{host}") unless facebook_hosts.include?(host)
|
185
185
|
|
186
186
|
# Replace the host with a default one to prevent redirect loops that can happen
|
187
187
|
unless parsed_url.host == "www.facebook.com"
|
data/lib/forki/scrapers/user_scraper.rb
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
require "typhoeus"
|
2
|
+
require "zorki"
|
2
3
|
|
4
|
+
# rubocop:disable Metrics/ClassLength
|
3
5
|
module Forki
|
4
6
|
class UserScraper < Scraper
|
5
7
|
# Finds and returns the number of people who like the current page
|
6
8
|
def find_number_of_likes(profile_details_string)
|
7
|
-
likes_pattern = /[0-9,.KM ] likes/
|
9
|
+
# likes_pattern = /[0-9,.KM ] likes/
|
8
10
|
likes_pattern = /(?<num_likes>[0-9,.KM ]+) (l|L)ikes/
|
9
11
|
number_of_likes_match = likes_pattern.match(profile_details_string)
|
10
12
|
|
@@ -100,7 +102,14 @@ module Forki
|
|
100
102
|
|
101
103
|
# Uses GraphQL data and DOM elements to collect information about the current user page
|
102
104
|
def parse(url)
|
105
|
+
# So some reels may actually link to an instagram user?
|
106
|
+
if url.include?("instagram.com")
|
107
|
+
user = get_instagram_user(url)
|
108
|
+
return user
|
109
|
+
end
|
110
|
+
|
103
111
|
validate_and_load_page(url)
|
112
|
+
|
104
113
|
graphql_strings = find_graphql_data_strings(page.html)
|
105
114
|
is_page = graphql_strings.map { |s| JSON.parse(s) }.any? { |o| o.key?("page") }
|
106
115
|
user_details = is_page ? extract_page_details(graphql_strings) : extract_profile_details(graphql_strings)
|
@@ -110,5 +119,38 @@ module Forki
|
|
110
119
|
|
111
120
|
user_details
|
112
121
|
end
|
122
|
+
|
123
|
+
def get_instagram_user(url)
|
124
|
+
uri = URI(url)
|
125
|
+
query = uri.query
|
126
|
+
components = URI.decode_uri_component(query)
|
127
|
+
extracted_url = URI.extract(components).first
|
128
|
+
extracted_uri = URI(extracted_url)
|
129
|
+
username = extracted_uri.to_s.match(/(https:\/\/www.instagram.com\/_u\/[\w]+)/).to_s.split("/").last
|
130
|
+
|
131
|
+
page.quit # I think we need to do this so Zorki can run?
|
132
|
+
zorki_users = Zorki::User.lookup(username)
|
133
|
+
zorki_user = nil
|
134
|
+
|
135
|
+
if zorki_users.count.positive?
|
136
|
+
zorki_user = zorki_users.first
|
137
|
+
else
|
138
|
+
raise ContentUnavailableError
|
139
|
+
end
|
140
|
+
|
141
|
+
|
142
|
+
# Convert a zorki_user into a hash for Forki
|
143
|
+
{
|
144
|
+
name: zorki_user.name,
|
145
|
+
id: username,
|
146
|
+
number_of_followers: zorki_user.number_of_followers,
|
147
|
+
verified: zorki_user.verified,
|
148
|
+
profile: zorki_user.profile,
|
149
|
+
profile_link: zorki_user.profile_link,
|
150
|
+
profile_image_file: zorki_user.profile_image,
|
151
|
+
profile_image_url: zorki_user.profile_image_url,
|
152
|
+
number_of_likes: 0
|
153
|
+
}
|
154
|
+
end
|
113
155
|
end
|
114
156
|
end
|
data/lib/forki/user.rb
CHANGED
data/lib/forki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: forki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.10
|
4
|
+
version: 0.2.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ''
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-10-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - ">="
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: zorki
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: thor
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|