webshaker 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +89 -6
- data/lib/webshaker/ai.rb +42 -0
- data/lib/webshaker/shaker.rb +3 -30
- data/lib/webshaker/version.rb +1 -1
- data/lib/webshaker.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4b03ca48d3495f70d5097d85325b4ab917b4ff0217b6634fcc50a9649e7ffc5e
|
4
|
+
data.tar.gz: 12925527140a41b0dd5e64a5551292a0fc97a7d42221bbe636d6517ec3d8d6ac
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2e2bc1973b04b9fec176b7cde9aa07908c29b83a9ec074d25d983e5f2b025b60b84ce6635483566325222868224f598d07ce9a8e7d4d778873104f920a0369d8
|
7
|
+
data.tar.gz: 88ac426fa0c12c9e2e2a45e2cf6ed23264402ef05c9be30f9cbd0b18c83caba46d38f26665197fb354161b7efb9fd692b8f0474928f990f8b613c0552f38ca6d
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
# Webshaker
|
2
2
|
|
3
|
-
|
3
|
+
|
4
|
+
| Tests | Coverage |
|
5
|
+
|:-:|:-:|
|
6
|
+
| [](https://github.com/mochetts/webshaker/actions/workflows/main.yml) | [](https://codecov.io/github/mochetts/webshaker) |
|
7
|
+
|
8
|
+
An intelligent web scraper that uses Selenium WebDriver to scrape a URL and parse it using AI.
|
4
9
|
|
5
10
|
## Installation
|
6
11
|
|
@@ -30,17 +35,81 @@ end
|
|
30
35
|
|
31
36
|
```ruby
|
32
37
|
# Create a shaker out of a specific URL.
|
33
|
-
|
34
|
-
|
35
|
-
#
|
38
|
+
shaker = Webshaker::Shaker.new("https://www.google.com")
|
39
|
+
|
40
|
+
# Query anything about the website.
|
41
|
+
result = shaker.shake(with_prompt: "What's this website about?")
|
42
|
+
# => "This website appears to be the homepage of Google, specifically targeting users in Uruguay (as indicated by the reference to \"Uruguay\" and the Spanish language). It contains links to various Google services such as Gmail, Google Images, and a login page for Google accounts. Additionally, there are sections for user feedback, search functionalities, and links to Google policies and services. The layout includes buttons, forms, and elements for user interaction, typical of a search engine homepage."
|
43
|
+
|
44
|
+
result = shaker.shake(with_prompt: "Give me a list of all the links", respond_with: :json)
|
45
|
+
# =>
|
46
|
+
# {"links"=>
|
47
|
+
# ["https://mail.google.com/mail/&ogbl",
|
48
|
+
# "https://www.google.com/imghp?hl=es-419&ogbl",
|
49
|
+
# "https://accounts.google.com/ServiceLogin?hl=es-419&passive=true&continue=https://www.google.com/&ec=GAZAmgQ",
|
50
|
+
# "/search?sca_esv=08a7c6c574dce941&sca_upv=1&q=vela+olimpiadas&oi=ddle&ct=335645951&hl=es-419&sa=X&ved=0ahUKEwiBsrmL9taHAxWGq5UCHV1hEJkQPQgC",
|
51
|
+
# "https://about.google/?utm_source=google-UY&utm_medium=referral&utm_campaign=hp-footer&fg=1",
|
52
|
+
# "https://www.google.com/intl/es-419_uy/ads/?subid=ww-ww-et-g-awa-a-g_hpafoot1_1!o2&utm_source=google.com&utm_medium=referral&utm_campaign=google_hpafooter&fg=1",
|
53
|
+
# "https://www.google.com/services/?subid=ww-ww-et-g-awa-a-g_hpbfoot1_1!o2&utm_source=google.com&utm_medium=referral&utm_campaign=google_hpbfooter&fg=1",
|
54
|
+
# "https://google.com/search/howsearchworks/?fg=1",
|
55
|
+
# "https://policies.google.com/privacy?hl=es-419&fg=1",
|
56
|
+
# "https://policies.google.com/terms?hl=es-419&fg=1",
|
57
|
+
# "https://www.google.com/preferences?hl=es-419&fg=1",
|
58
|
+
# "/advanced_search?hl=es-419&fg=1",
|
59
|
+
# "/history/privacyadvisor/search/unauth?utm_source=googlemenu&fg=1&cctld=com",
|
60
|
+
# "/history/optout?hl=es-419&fg=1",
|
61
|
+
# "https://support.google.com/websearch/?p=ws_results_help&hl=es-419&fg=1"]}
|
62
|
+
```
|
63
|
+
|
64
|
+
### SPA dynamic hydration
|
65
|
+
|
66
|
+
Sometimes, the page we want to scrape is a Single Page Application. This means that the initial HTML returned by the server does not contain the HTML that we want to scrape, but just a skeleton that gets filled in by JS scripting.
|
67
|
+
|
68
|
+
In order to circumvent this, you can let the webshaker know that it needs to wait for some html content to appear before it scrapes the HTML.
|
69
|
+
|
70
|
+
For example, you can wait for a certain XPath to show up:
|
71
|
+
|
72
|
+
```rb
|
36
73
|
shaker = Webshaker::Shaker.new(
|
37
74
|
"https://www.google.com",
|
38
75
|
{wait_for: {xpath: "/html/body/div[1]/div[3]/form/div[1]/div[1]/div[1]/div/div[3]"}}
|
39
76
|
)
|
40
77
|
|
41
|
-
# Query anything about the website.
|
42
78
|
result = shaker.shake(with_prompt: "What's this website about?")
|
43
|
-
|
79
|
+
```
|
80
|
+
|
81
|
+
You can also wait for a certain CSS selector to be found:
|
82
|
+
|
83
|
+
```rb
|
84
|
+
shaker = Webshaker::Shaker.new(
|
85
|
+
"https://www.google.com",
|
86
|
+
{wait_for: {css: ".some-class"}}
|
87
|
+
)
|
88
|
+
|
89
|
+
result = shaker.shake(with_prompt: "What's this website about?")
|
90
|
+
```
|
91
|
+
|
92
|
+
Or you can wait for some specific node to become available:
|
93
|
+
|
94
|
+
```rb
|
95
|
+
shaker = Webshaker::Shaker.new(
|
96
|
+
"https://www.google.com",
|
97
|
+
{wait_for: {tag_name: "body"}}
|
98
|
+
)
|
99
|
+
|
100
|
+
result = shaker.shake(with_prompt: "What's this website about?")
|
101
|
+
```
|
102
|
+
|
103
|
+
### Responding with JSON
|
104
|
+
|
105
|
+
In some scenarios (e.g. when the communication is between two systems), it might be better to return JSON. To do so, just add the `respond_with: :json` keyword param to the `shake` call:
|
106
|
+
|
107
|
+
```ruby
|
108
|
+
# Create a shaker out of a specific URL.
|
109
|
+
shaker = Webshaker::Shaker.new(
|
110
|
+
"https://www.google.com",
|
111
|
+
{wait_for: {xpath: "/html/body/div[1]/div[3]/form/div[1]/div[1]/div[1]/div/div[3]"}}
|
112
|
+
)
|
44
113
|
|
45
114
|
result = shaker.shake(with_prompt: "Give me a list of all the links", respond_with: :json)
|
46
115
|
# =>
|
@@ -62,6 +131,20 @@ result = shaker.shake(with_prompt: "Give me a list of all the links", respond_wi
|
|
62
131
|
# "https://support.google.com/websearch/?p=ws_results_help&hl=es-419&fg=1"]}
|
63
132
|
```
|
64
133
|
|
134
|
+
### Temperature
|
135
|
+
|
136
|
+
Sometimes we need to adjust the prompt temperature to give the result a bit of randomness. In order to do so, you can simply pass the `temperature` keyword to the `shake` call:
|
137
|
+
|
138
|
+
```rb
|
139
|
+
# Create a shaker out of a specific URL.
|
140
|
+
shaker = Webshaker::Shaker.new(
|
141
|
+
"https://www.google.com",
|
142
|
+
{wait_for: {xpath: "/html/body/div[1]/div[3]/form/div[1]/div[1]/div[1]/div/div[3]"}}
|
143
|
+
)
|
144
|
+
|
145
|
+
result = shaker.shake(with_prompt: "Give me a list of all the links", respond_with: :json, temperature: 0.2)
|
146
|
+
```
|
147
|
+
|
65
148
|
## Development
|
66
149
|
|
67
150
|
You can use `bin/console` to access an interactive console. This will preload environment variables from a `.env` file.
|
data/lib/webshaker/ai.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
module Webshaker
|
2
|
+
class Ai
|
3
|
+
attr_reader :html_content
|
4
|
+
|
5
|
+
def initialize(html_content)
|
6
|
+
@html_content = html_content
|
7
|
+
end
|
8
|
+
|
9
|
+
def analyze(with_prompt:, respond_with: :text, temperature: 0.8)
|
10
|
+
response = ai_client.chat(
|
11
|
+
parameters: {
|
12
|
+
model: Webshaker.config.model,
|
13
|
+
messages: messages(with_prompt).concat((respond_with.to_sym == :json) ? [{role: "user", content: "respond with json"}] : []),
|
14
|
+
temperature:
|
15
|
+
}.merge(
|
16
|
+
(respond_with == :json) ? {response_format: {type: "json_object"}} : {}
|
17
|
+
)
|
18
|
+
)
|
19
|
+
|
20
|
+
# Return full response from the ai client if the respond_with is set to :full
|
21
|
+
return response if respond_with === :full
|
22
|
+
|
23
|
+
response = response["choices"][0]["message"]["content"]
|
24
|
+
response = JSON.parse(response) if respond_with === :json
|
25
|
+
response
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
def ai_client
|
31
|
+
@client ||= OpenAI::Client.new(access_token: Webshaker.config.open_ai_key)
|
32
|
+
end
|
33
|
+
|
34
|
+
def messages(user_prompt)
|
35
|
+
[
|
36
|
+
{role: "system", content: "You are an HTML interpreter. The user will give you the contents of an HTML and ask you something about it. Please comply with what the user asks."},
|
37
|
+
{role: "user", content: html_content},
|
38
|
+
{role: "user", content: user_prompt}
|
39
|
+
]
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
data/lib/webshaker/shaker.rb
CHANGED
@@ -10,11 +10,7 @@ module Webshaker
|
|
10
10
|
end
|
11
11
|
|
12
12
|
def shake(with_prompt:, respond_with: :text, temperature: 0.8)
|
13
|
-
|
14
|
-
with_prompt,
|
15
|
-
respond_with:,
|
16
|
-
temperature:
|
17
|
-
)
|
13
|
+
ai.analyze(with_prompt:, respond_with:, temperature:) # forward the caller's options instead of re-hardcoding the defaults
|
18
14
|
end
|
19
15
|
|
20
16
|
private
|
@@ -23,31 +19,8 @@ module Webshaker
|
|
23
19
|
@html ||= Webshaker::Scraper.new(url, scrape_options).scrape.html
|
24
20
|
end
|
25
21
|
|
26
|
-
def
|
27
|
-
|
28
|
-
parameters: {
|
29
|
-
model: Webshaker.config.model,
|
30
|
-
messages: ai_prompt(user_prompt, html).concat((respond_with == :json) ? [{role: "user", content: "respond with json"}] : []),
|
31
|
-
temperature:
|
32
|
-
}.merge(
|
33
|
-
(respond_with == :json) ? {response_format: {type: "json_object"}} : {}
|
34
|
-
)
|
35
|
-
)
|
36
|
-
response = response["choices"][0]["message"]["content"]
|
37
|
-
response = JSON.parse(response) if respond_with === :json
|
38
|
-
response
|
39
|
-
end
|
40
|
-
|
41
|
-
def ai_client
|
42
|
-
@client ||= OpenAI::Client.new(access_token: Webshaker.config.open_ai_key)
|
43
|
-
end
|
44
|
-
|
45
|
-
def ai_prompt(user_prompt, html_content)
|
46
|
-
[
|
47
|
-
{role: "system", content: "You are an HTML interpreter. The user will give you the contents of an HTML and ask you something about it. Please comply with what the user asks."},
|
48
|
-
{role: "user", content: html_content},
|
49
|
-
{role: "user", content: user_prompt}
|
50
|
-
]
|
22
|
+
def ai
|
23
|
+
@ai ||= Webshaker::Ai.new(html)
|
51
24
|
end
|
52
25
|
end
|
53
26
|
end
|
data/lib/webshaker/version.rb
CHANGED
data/lib/webshaker.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webshaker
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Martin Mochetti
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-08-
|
11
|
+
date: 2024-08-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: selenium-webdriver
|
@@ -70,6 +70,7 @@ files:
|
|
70
70
|
- README.md
|
71
71
|
- Rakefile
|
72
72
|
- lib/webshaker.rb
|
73
|
+
- lib/webshaker/ai.rb
|
73
74
|
- lib/webshaker/scrape_result.rb
|
74
75
|
- lib/webshaker/scraper.rb
|
75
76
|
- lib/webshaker/shaker.rb
|