zorki 0.1.3 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/zorki/monkeypatch.rb +3 -1
- data/lib/zorki/scrapers/post_scraper.rb +4 -1
- data/lib/zorki/scrapers/scraper.rb +21 -6
- data/lib/zorki/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 174766669fddfafe37892c58e5caa7706e9016244bda0621353178700631c8ef
|
4
|
+
data.tar.gz: 0b061de3dce20457b29ffbae043bb12a35a6deabca528effa1134f3757610403
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cf2b8fc81c18bd4a4e4095c81f1ea6832a0c17a625eb802e13a0cc37ea027a912992158d7b9aba199e0225d46020dd098410a2ab4d61b590ab37894e118127a5
|
7
|
+
data.tar.gz: 6d380eb69c798c66e0a77b504ca2b7f6494afb3c1ea9ea77eacd6868f0476c60003c9f1f46dfc8683c2290833715ece072f40de48b395ab93369e769be945227
|
data/lib/zorki/monkeypatch.rb
CHANGED
@@ -40,7 +40,9 @@ module SeleniumMonkeypatch
|
|
40
40
|
data = { method: method, params: params.compact }
|
41
41
|
data[:sessionId] = @session_id if @session_id
|
42
42
|
message = @ws.send_cmd(**data)
|
43
|
-
|
43
|
+
if message.nil? == false && message["error"] && (method != "Fetch.continueRequest")
|
44
|
+
raise Error::WebDriverError, error_message(message["error"])
|
45
|
+
end
|
44
46
|
|
45
47
|
message
|
46
48
|
end
|
data/lib/zorki/scrapers/post_scraper.rb
CHANGED
@@ -23,6 +23,8 @@ module Zorki
|
|
23
23
|
"data,xdt_api__v1__media__shortcode__web_info,items"
|
24
24
|
)
|
25
25
|
|
26
|
+
graphql_object = graphql_object.first if graphql_object.kind_of?(Array)
|
27
|
+
|
26
28
|
# For pages that have been marked misinfo the structure is very different than not
|
27
29
|
# If it is a clean post then it's just a schema.org thing, but if it's misinfo it's the old
|
28
30
|
# way of deeply nested stuff.
|
@@ -54,7 +56,8 @@ module Zorki
|
|
54
56
|
else
|
55
57
|
# We need to see if this is a single image post or a slideshow. We do that
|
56
58
|
# by looking for a single image, if it's not there, we assume the alternative.
|
57
|
-
|
59
|
+
# debugger
|
60
|
+
# graphql_object = graphql_object["data"]["xdt_api__v1__media__shortcode__web_info"]
|
58
61
|
|
59
62
|
unless graphql_object["items"][0].has_key?("video_versions") && !graphql_object["items"][0]["video_versions"].nil?
|
60
63
|
# Check if there is a slideshow or not
|
data/lib/zorki/scrapers/scraper.rb
CHANGED
@@ -5,9 +5,9 @@ require "dotenv/load"
|
|
5
5
|
require "oj"
|
6
6
|
require "selenium-webdriver"
|
7
7
|
require "logger"
|
8
|
-
require "debug"
|
9
8
|
require "securerandom"
|
10
9
|
require "selenium/webdriver/remote/http/curb"
|
10
|
+
require "debug"
|
11
11
|
|
12
12
|
# 2022-06-07 14:15:23 WARN Selenium [DEPRECATION] [:browser_options] :options as a parameter for driver initialization is deprecated. Use :capabilities with an Array of value capabilities/options if necessary instead.
|
13
13
|
|
@@ -112,12 +112,27 @@ module Zorki
|
|
112
112
|
# TODO: put this before the whole load loop
|
113
113
|
if response_body.nil?
|
114
114
|
doc = Nokogiri::HTML(page.driver.browser.page_source)
|
115
|
-
elements = doc.search("script").find_all do |e|
|
116
|
-
|
117
|
-
end
|
115
|
+
# elements = doc.search("script").find_all do |e|
|
116
|
+
# e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
|
117
|
+
# end
|
118
|
+
|
119
|
+
elements = doc.search("script").map do |element|
|
120
|
+
element_json = nil
|
121
|
+
begin
|
122
|
+
element_json = JSON.parse(element)
|
123
|
+
|
124
|
+
element_json = element_json["require"].first.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
|
125
|
+
rescue StandardError => e
|
126
|
+
next
|
127
|
+
end
|
128
|
+
|
129
|
+
element_json
|
130
|
+
end.compact
|
118
131
|
|
119
|
-
|
120
|
-
|
132
|
+
if elements&.empty?
|
133
|
+
raise ContentUnavailableError
|
134
|
+
end
|
135
|
+
return elements
|
121
136
|
end
|
122
137
|
|
123
138
|
raise ContentUnavailableError if response_body.nil?
|
data/lib/zorki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zorki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christopher Guess
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-08-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|