zorki 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/zorki/monkeypatch.rb +3 -1
- data/lib/zorki/scrapers/post_scraper.rb +4 -1
- data/lib/zorki/scrapers/scraper.rb +21 -6
- data/lib/zorki/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 174766669fddfafe37892c58e5caa7706e9016244bda0621353178700631c8ef
|
4
|
+
data.tar.gz: 0b061de3dce20457b29ffbae043bb12a35a6deabca528effa1134f3757610403
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cf2b8fc81c18bd4a4e4095c81f1ea6832a0c17a625eb802e13a0cc37ea027a912992158d7b9aba199e0225d46020dd098410a2ab4d61b590ab37894e118127a5
|
7
|
+
data.tar.gz: 6d380eb69c798c66e0a77b504ca2b7f6494afb3c1ea9ea77eacd6868f0476c60003c9f1f46dfc8683c2290833715ece072f40de48b395ab93369e769be945227
|
data/lib/zorki/monkeypatch.rb
CHANGED
@@ -40,7 +40,9 @@ module SeleniumMonkeypatch
|
|
40
40
|
data = { method: method, params: params.compact }
|
41
41
|
data[:sessionId] = @session_id if @session_id
|
42
42
|
message = @ws.send_cmd(**data)
|
43
|
-
|
43
|
+
if message.nil? == false && message["error"] && (method != "Fetch.continueRequest")
|
44
|
+
raise Error::WebDriverError, error_message(message["error"])
|
45
|
+
end
|
44
46
|
|
45
47
|
message
|
46
48
|
end
|
data/lib/zorki/scrapers/post_scraper.rb
CHANGED
@@ -23,6 +23,8 @@ module Zorki
|
|
23
23
|
"data,xdt_api__v1__media__shortcode__web_info,items"
|
24
24
|
)
|
25
25
|
|
26
|
+
graphql_object = graphql_object.first if graphql_object.kind_of?(Array)
|
27
|
+
|
26
28
|
# For pages that have been marked misinfo the structure is very different than not
|
27
29
|
# If it is a clean post then it's just a schema.org thing, but if it's misinfo it's the old
|
28
30
|
# way of deeply nested stuff.
|
@@ -54,7 +56,8 @@ module Zorki
|
|
54
56
|
else
|
55
57
|
# We need to see if this is a single image post or a slideshow. We do that
|
56
58
|
# by looking for a single image, if it's not there, we assume the alternative.
|
57
|
-
|
59
|
+
# debugger
|
60
|
+
# graphql_object = graphql_object["data"]["xdt_api__v1__media__shortcode__web_info"]
|
58
61
|
|
59
62
|
unless graphql_object["items"][0].has_key?("video_versions") && !graphql_object["items"][0]["video_versions"].nil?
|
60
63
|
# Check if there is a slideshow or not
|
data/lib/zorki/scrapers/scraper.rb
CHANGED
@@ -5,9 +5,9 @@ require "dotenv/load"
|
|
5
5
|
require "oj"
|
6
6
|
require "selenium-webdriver"
|
7
7
|
require "logger"
|
8
|
-
require "debug"
|
9
8
|
require "securerandom"
|
10
9
|
require "selenium/webdriver/remote/http/curb"
|
10
|
+
require "debug"
|
11
11
|
|
12
12
|
# 2022-06-07 14:15:23 WARN Selenium [DEPRECATION] [:browser_options] :options as a parameter for driver initialization is deprecated. Use :capabilities with an Array of value capabilities/options if necessary instead.
|
13
13
|
|
@@ -112,12 +112,27 @@ module Zorki
|
|
112
112
|
# TODO: put this before the whole load loop
|
113
113
|
if response_body.nil?
|
114
114
|
doc = Nokogiri::HTML(page.driver.browser.page_source)
|
115
|
-
elements = doc.search("script").find_all do |e|
|
116
|
-
|
117
|
-
end
|
115
|
+
# elements = doc.search("script").find_all do |e|
|
116
|
+
# e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
|
117
|
+
# end
|
118
|
+
|
119
|
+
elements = doc.search("script").map do |element|
|
120
|
+
element_json = nil
|
121
|
+
begin
|
122
|
+
element_json = JSON.parse(element)
|
123
|
+
|
124
|
+
element_json = element_json["require"].first.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
|
125
|
+
rescue StandardError => e
|
126
|
+
next
|
127
|
+
end
|
128
|
+
|
129
|
+
element_json
|
130
|
+
end.compact
|
118
131
|
|
119
|
-
|
120
|
-
|
132
|
+
if elements&.empty?
|
133
|
+
raise ContentUnavailableError
|
134
|
+
end
|
135
|
+
return elements
|
121
136
|
end
|
122
137
|
|
123
138
|
raise ContentUnavailableError if response_body.nil?
|
data/lib/zorki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zorki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christopher Guess
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-08-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|