flunky 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +38 -0
- data/.ruby-version +1 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +84 -0
- data/Gemfile +13 -0
- data/LICENSE.txt +21 -0
- data/README.md +100 -0
- data/Rakefile +12 -0
- data/examples/anthropic_agent.rb +89 -0
- data/flunky.gemspec +41 -0
- data/lib/flunky/actions.rb +100 -0
- data/lib/flunky/agent.rb +53 -0
- data/lib/flunky/configuration.rb +48 -0
- data/lib/flunky/drivers/base.rb +73 -0
- data/lib/flunky/drivers/ferrum_driver.rb +147 -0
- data/lib/flunky/errors.rb +15 -0
- data/lib/flunky/js/snapshot.js +98 -0
- data/lib/flunky/session.rb +55 -0
- data/lib/flunky/snapshot.rb +81 -0
- data/lib/flunky/tools.rb +76 -0
- data/lib/flunky/version.rb +5 -0
- data/lib/flunky.rb +24 -0
- data/sig/flunky.rbs +4 -0
- metadata +87 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 396af494516ace4e790c451a749a47b96db080bfd9d386825902f249353b2196
|
|
4
|
+
data.tar.gz: 4f01dff6543ab4365691de268616669331fd3ec2d59812d6c35afe1dc362fe78
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 22f541f37531252d1ac60a1183f53c975f471d9ac45cc8a13e9e80f8bbfc2882c5203360089a766fb91d4edea513a9da5410d573c1e708f5b893df518918f3a8
|
|
7
|
+
data.tar.gz: 7d50a1ba1ae192c0b9a468558499e5c1caf25a3b110b3e9a690d2592c5c89c85e8f3ce5dd94c60dd164f16b261bdb5701d4faf277e6e429f16fcd8eb22014e1d
|
data/.rspec
ADDED
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
AllCops:
|
|
2
|
+
TargetRubyVersion: 3.0
|
|
3
|
+
NewCops: enable
|
|
4
|
+
SuggestExtensions: false
|
|
5
|
+
Exclude:
|
|
6
|
+
- "examples/**/*"
|
|
7
|
+
# CI installs gems here via bundler-cache; never lint third-party code.
|
|
8
|
+
- "vendor/**/*"
|
|
9
|
+
|
|
10
|
+
Metrics:
|
|
11
|
+
Enabled: false
|
|
12
|
+
|
|
13
|
+
Style/Documentation:
|
|
14
|
+
Enabled: false
|
|
15
|
+
|
|
16
|
+
# Short, conventional names (dx, dy, el) are intentional here.
|
|
17
|
+
Naming/MethodParameterName:
|
|
18
|
+
MinNameLength: 2
|
|
19
|
+
|
|
20
|
+
# Command methods on the driver return true to signal success; they are not
|
|
21
|
+
# predicates and should not be forced to end with "?".
|
|
22
|
+
Naming/PredicateMethod:
|
|
23
|
+
Enabled: false
|
|
24
|
+
|
|
25
|
+
# The abstract driver declares keyword arguments it does not itself use.
|
|
26
|
+
Lint/UnusedMethodArgument:
|
|
27
|
+
AllowUnusedKeywordArguments: true
|
|
28
|
+
|
|
29
|
+
Style/StringLiterals:
|
|
30
|
+
Enabled: true
|
|
31
|
+
EnforcedStyle: double_quotes
|
|
32
|
+
|
|
33
|
+
Style/StringLiteralsInInterpolation:
|
|
34
|
+
Enabled: true
|
|
35
|
+
EnforcedStyle: double_quotes
|
|
36
|
+
|
|
37
|
+
Layout/LineLength:
|
|
38
|
+
Max: 120
|
data/.ruby-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.3.6
|
data/CHANGELOG.md
ADDED
data/CODE_OF_CONDUCT.md
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# Contributor Covenant Code of Conduct
|
|
2
|
+
|
|
3
|
+
## Our Pledge
|
|
4
|
+
|
|
5
|
+
We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation.
|
|
6
|
+
|
|
7
|
+
We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community.
|
|
8
|
+
|
|
9
|
+
## Our Standards
|
|
10
|
+
|
|
11
|
+
Examples of behavior that contributes to a positive environment for our community include:
|
|
12
|
+
|
|
13
|
+
* Demonstrating empathy and kindness toward other people
|
|
14
|
+
* Being respectful of differing opinions, viewpoints, and experiences
|
|
15
|
+
* Giving and gracefully accepting constructive feedback
|
|
16
|
+
* Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience
|
|
17
|
+
* Focusing on what is best not just for us as individuals, but for the overall community
|
|
18
|
+
|
|
19
|
+
Examples of unacceptable behavior include:
|
|
20
|
+
|
|
21
|
+
* The use of sexualized language or imagery, and sexual attention or
|
|
22
|
+
advances of any kind
|
|
23
|
+
* Trolling, insulting or derogatory comments, and personal or political attacks
|
|
24
|
+
* Public or private harassment
|
|
25
|
+
* Publishing others' private information, such as a physical or email
|
|
26
|
+
address, without their explicit permission
|
|
27
|
+
* Other conduct which could reasonably be considered inappropriate in a
|
|
28
|
+
professional setting
|
|
29
|
+
|
|
30
|
+
## Enforcement Responsibilities
|
|
31
|
+
|
|
32
|
+
Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful.
|
|
33
|
+
|
|
34
|
+
Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate.
|
|
35
|
+
|
|
36
|
+
## Scope
|
|
37
|
+
|
|
38
|
+
This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event.
|
|
39
|
+
|
|
40
|
+
## Enforcement
|
|
41
|
+
|
|
42
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at alexrupom@hotmail.com. All complaints will be reviewed and investigated promptly and fairly.
|
|
43
|
+
|
|
44
|
+
All community leaders are obligated to respect the privacy and security of the reporter of any incident.
|
|
45
|
+
|
|
46
|
+
## Enforcement Guidelines
|
|
47
|
+
|
|
48
|
+
Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct:
|
|
49
|
+
|
|
50
|
+
### 1. Correction
|
|
51
|
+
|
|
52
|
+
**Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community.
|
|
53
|
+
|
|
54
|
+
**Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested.
|
|
55
|
+
|
|
56
|
+
### 2. Warning
|
|
57
|
+
|
|
58
|
+
**Community Impact**: A violation through a single incident or series of actions.
|
|
59
|
+
|
|
60
|
+
**Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban.
|
|
61
|
+
|
|
62
|
+
### 3. Temporary Ban
|
|
63
|
+
|
|
64
|
+
**Community Impact**: A serious violation of community standards, including sustained inappropriate behavior.
|
|
65
|
+
|
|
66
|
+
**Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban.
|
|
67
|
+
|
|
68
|
+
### 4. Permanent Ban
|
|
69
|
+
|
|
70
|
+
**Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals.
|
|
71
|
+
|
|
72
|
+
**Consequence**: A permanent ban from any sort of public interaction within the community.
|
|
73
|
+
|
|
74
|
+
## Attribution
|
|
75
|
+
|
|
76
|
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.0,
|
|
77
|
+
available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
|
|
78
|
+
|
|
79
|
+
Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity).
|
|
80
|
+
|
|
81
|
+
[homepage]: https://www.contributor-covenant.org
|
|
82
|
+
|
|
83
|
+
For answers to common questions about this code of conduct, see the FAQ at
|
|
84
|
+
https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations.
|
data/Gemfile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
source "https://rubygems.org"
|
|
4
|
+
|
|
5
|
+
# Runtime dependencies live in flunky.gemspec (ferrum).
|
|
6
|
+
gemspec
|
|
7
|
+
|
|
8
|
+
# Development and test tooling: not needed by anyone who installs the gem.
|
|
9
|
+
group :development, :test do
|
|
10
|
+
gem "rake", "~> 13.0"
|
|
11
|
+
gem "rspec", "~> 3.0"
|
|
12
|
+
gem "rubocop", "~> 1.21"
|
|
13
|
+
end
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 alexrupom
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# Flunky
|
|
2
|
+
|
|
3
|
+
Flunky lets any AI agent drive a real browser. It borrows the layering of a
|
|
4
|
+
browser automation library (a thin driver, a stateful session, a readable action
|
|
5
|
+
DSL) and adds the two things an agent needs that a test framework does not: a way
|
|
6
|
+
to show the page to a model, and a way to expose actions as tools the model can
|
|
7
|
+
call.
|
|
8
|
+
|
|
9
|
+
The gem never calls an AI vendor itself. It emits tool schemas and a dispatcher;
|
|
10
|
+
you inject the model client.
|
|
11
|
+
|
|
12
|
+
## How it fits together
|
|
13
|
+
|
|
14
|
+
- **Driver** (`Drivers::Base`, `Drivers::FerrumDriver`) talks to the browser.
|
|
15
|
+
Ferrum drives Chrome over the DevTools Protocol with no Selenium server. The
|
|
16
|
+
backend is swappable.
|
|
17
|
+
- **Snapshot** reduces the live page to the elements an agent can act on, each
|
|
18
|
+
stamped with an integer `ref`, and renders a compact prompt block.
|
|
19
|
+
- **Actions** is the human-readable DSL over the driver (`click`, `type`,
|
|
20
|
+
`fill_in`, ...).
|
|
21
|
+
- **Session** owns one driver, caches the latest snapshot, and exposes the
|
|
22
|
+
actions.
|
|
23
|
+
- **Tools** turns a session into vendor-neutral tool schemas plus a dispatcher.
|
|
24
|
+
- **Agent** is an optional observe/decide/act loop around an injected model.
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
Flunky needs Ruby >= 3.0 and a local Chrome (Ferrum launches it).
|
|
29
|
+
|
|
30
|
+
```
|
|
31
|
+
bundle add flunky
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
or
|
|
35
|
+
|
|
36
|
+
```
|
|
37
|
+
gem install flunky
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Usage
|
|
41
|
+
|
|
42
|
+
```ruby
|
|
43
|
+
require "flunky"
|
|
44
|
+
|
|
45
|
+
Flunky.session do |s|
|
|
46
|
+
s.visit("https://example.com")
|
|
47
|
+
puts s.snapshot.to_prompt # the page as the model sees it
|
|
48
|
+
s.actions.click(1) # act on a stamped ref
|
|
49
|
+
end
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
`refs` only exist after a snapshot stamps them, so observe (or read
|
|
53
|
+
`snapshot`) before acting. After client-side navigation a ref can go stale; the
|
|
54
|
+
tool dispatcher re-observes after every action so the model always sees the
|
|
55
|
+
current page.
|
|
56
|
+
|
|
57
|
+
### Tool calling
|
|
58
|
+
|
|
59
|
+
```ruby
|
|
60
|
+
session = Flunky::Session.new
|
|
61
|
+
tools = Flunky::Tools.new(session)
|
|
62
|
+
|
|
63
|
+
tools.definitions # hand straight to an Anthropic-style client
|
|
64
|
+
tools.dispatch("click", { ref: 1 }) # run a returned tool call
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
The tool schema shape (`name` / `description` / `input_schema`) matches
|
|
68
|
+
Anthropic's tool format. For OpenAI, wrap each definition as
|
|
69
|
+
`{ type: "function", function: { **defn, parameters: defn[:input_schema] } }`.
|
|
70
|
+
|
|
71
|
+
### Agent loop
|
|
72
|
+
|
|
73
|
+
`Flunky::Agent.new(session, model:)` drives an observe/decide/act loop. `model`
|
|
74
|
+
must respond to `call(messages:, tools:)` and return
|
|
75
|
+
`{ text:, tool_calls: [{ id:, name:, arguments: }] }`. See
|
|
76
|
+
[examples/anthropic_agent.rb](examples/anthropic_agent.rb) for a roughly 50-line
|
|
77
|
+
adapter to Anthropic's `/v1/messages` endpoint.
|
|
78
|
+
|
|
79
|
+
### Configuration
|
|
80
|
+
|
|
81
|
+
```ruby
|
|
82
|
+
Flunky.configure do |c|
|
|
83
|
+
c.headless = true
|
|
84
|
+
c.default_timeout = 10
|
|
85
|
+
c.max_elements = 200
|
|
86
|
+
c.window_size = [1280, 800]
|
|
87
|
+
end
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Per-session options passed to `Session.new` override the global configuration.
|
|
91
|
+
|
|
92
|
+
## Development
|
|
93
|
+
|
|
94
|
+
After checking out the repo, run `bin/setup` to install dependencies, then
|
|
95
|
+
`bundle exec rspec`. Specs tagged `:browser` are skipped automatically when
|
|
96
|
+
Chrome is not installed, so the suite passes on a bare machine.
|
|
97
|
+
|
|
98
|
+
## License
|
|
99
|
+
|
|
100
|
+
Available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Example model adapter for Flunky::Agent. This lives outside the gem core on
|
|
4
|
+
# purpose: Flunky never talks to a model vendor itself, so the caller supplies
|
|
5
|
+
# an object that responds to #call(messages:, tools:) and returns
|
|
6
|
+
# { text:, tool_calls: [{ id:, name:, arguments: }] }.
|
|
7
|
+
#
|
|
8
|
+
# It uses net/http rather than a vendor SDK so the example adds no dependency.
|
|
9
|
+
# The OpenAI variant would wrap each tool definition as
|
|
10
|
+
# { type: "function", function: { **defn, parameters: defn[:input_schema] } }
|
|
11
|
+
# and otherwise map messages to that API's shape.
|
|
12
|
+
|
|
13
|
+
require "net/http"
|
|
14
|
+
require "json"
|
|
15
|
+
require "flunky"
|
|
16
|
+
|
|
17
|
+
# Model adapter that lets Flunky::Agent use Anthropic's /v1/messages endpoint.
#
# Responds to #call(messages:, tools:) and returns the normalized
# { text:, tool_calls: [{ id:, name:, arguments: }] } hash. Uses net/http only.
class AnthropicAgent
  # Raised when the API answers with a non-2xx HTTP status.
  class Error < StandardError; end

  ENDPOINT = URI("https://api.anthropic.com/v1/messages")

  # api_key    - sent as the x-api-key header. Defaults to ENV["ANTHROPIC_API_KEY"];
  #              ENV.fetch raises KeyError when the variable is unset.
  # model      - model identifier placed in the request body.
  # max_tokens - max_tokens value placed in the request body.
  def initialize(api_key: ENV.fetch("ANTHROPIC_API_KEY"), model: "claude-opus-4-8", max_tokens: 4096)
    @api_key = api_key
    @model = model
    @max_tokens = max_tokens
  end

  # Flunky's tool schemas already match Anthropic's input_schema, so tools pass
  # through unchanged. Messages are translated into the assistant/tool_result
  # turn structure, and the reply is mapped back to { text:, tool_calls: }.
  #
  # Raises AnthropicAgent::Error when the HTTP response is not a success.
  def call(messages:, tools:)
    body = {
      model: @model,
      max_tokens: @max_tokens,
      tools: tools,
      messages: messages.map { |m| to_anthropic(m) }
    }
    parse(post(body))
  end

  private

  # Translate one normalized message into an Anthropic message hash.
  #   "assistant" -> text block (when non-empty) followed by tool_use blocks
  #   "tool"      -> a user turn carrying one tool_result block per result
  #   anything else is sent as a user turn with its content untouched
  def to_anthropic(message)
    case message[:role]
    when "assistant"
      blocks = []
      text = assistant_text(message)
      blocks << { type: "text", text: text } unless text.empty?
      (message[:tool_calls] || []).each do |tc|
        blocks << { type: "tool_use", id: tc[:id], name: tc[:name], input: tc[:arguments] || {} }
      end
      { role: "assistant", content: blocks }
    when "tool"
      results = message[:content].map do |r|
        { type: "tool_result", tool_use_id: r[:tool_call_id], content: r[:output].to_json }
      end
      { role: "user", content: results }
    else
      { role: "user", content: message[:content] }
    end
  end

  # Text of an assistant message. Flunky::Agent stores it under :content, while
  # hand-built messages may use :text; accept either so the text is not dropped.
  def assistant_text(message)
    text = message[:text].to_s
    return text unless text.empty?

    content = message[:content]
    content.is_a?(String) ? content : ""
  end

  # Map a decoded API reply onto the normalized shape. A reply without a
  # "content" array yields empty text and no tool calls instead of crashing.
  def parse(response)
    blocks = response["content"] || []
    text = blocks.select { |b| b["type"] == "text" }.map { |b| b["text"] }.join
    tool_calls = blocks.select { |b| b["type"] == "tool_use" }.map do |b|
      { id: b["id"], name: b["name"], arguments: b["input"] }
    end
    { text: text, tool_calls: tool_calls }
  end

  # POST the JSON body and return the decoded JSON reply.
  # Raises AnthropicAgent::Error (with status and body) on a non-2xx response,
  # so API errors surface as such rather than as a NoMethodError in #parse.
  def post(body)
    request = Net::HTTP::Post.new(ENDPOINT)
    request["x-api-key"] = @api_key
    request["anthropic-version"] = "2023-06-01"
    request["content-type"] = "application/json"
    request.body = body.to_json

    http = Net::HTTP.new(ENDPOINT.host, ENDPOINT.port)
    http.use_ssl = true
    response = http.request(request)
    unless response.is_a?(Net::HTTPSuccess)
      raise Error, "Anthropic API request failed (HTTP #{response.code}): #{response.body}"
    end

    JSON.parse(response.body)
  end
end
|
|
81
|
+
|
|
82
|
+
# Run a quick goal against a live page when invoked directly.
|
|
83
|
+
if __FILE__ == $PROGRAM_NAME
|
|
84
|
+
Flunky.session do |session|
|
|
85
|
+
agent = Flunky::Agent.new(session, model: AnthropicAgent.new)
|
|
86
|
+
session.visit("https://example.com")
|
|
87
|
+
agent.run("Find and click the more information link.", max_steps: 5)
|
|
88
|
+
end
|
|
89
|
+
end
|
data/flunky.gemspec
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "lib/flunky/version"
|
|
4
|
+
|
|
5
|
+
Gem::Specification.new do |spec|
  spec.name = "flunky"
  spec.version = Flunky::VERSION
  spec.authors = ["alexrupom"]
  spec.email = ["alexrupom@hotmail.com"]

  spec.summary = "Let any AI agent drive a real browser."
  spec.description = "Flunky gives AI agents a real browser: a swappable driver, a page-to-prompt " \
                     "snapshot, a Capybara-style action DSL, and vendor-neutral tool schemas plus a " \
                     "dispatcher. The gem never calls an AI vendor itself; model clients are injected."
  spec.homepage = "https://github.com/alexrupom/flunky"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 3.0.0"

  spec.metadata["rubygems_mfa_required"] = "true"
  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = spec.homepage
  spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"

  # Package every file tracked by git except this gemspec, the bin/test/spec/
  # features directories, and CI/VCS dotfiles.
  #
  # `git ls-files` yields paths relative to the project root, while __FILE__ can
  # be an absolute path depending on how the gemspec is loaded, so compare
  # against the basename; otherwise the gemspec is never excluded.
  gemspec = File.basename(__FILE__)
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject do |f|
      (f == gemspec) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
    end
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Default driver. Kept swappable behind Drivers::Base, but Ferrum ships out of the box
  # because it speaks Chrome DevTools directly with no Selenium server to manage.
  spec.add_dependency "ferrum", ">= 0.14"

  # For more information and examples about making a new gem, check out our
  # guide at: https://bundler.io/guides/creating_gem.html
end
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Flunky
|
|
4
|
+
# The human-readable action DSL over a driver. Each mutating action returns a
|
|
5
|
+
# small result hash so the tool layer can report outcomes uniformly.
|
|
6
|
+
#
|
|
7
|
+
# Actions also holds a reference to its session because field resolution
|
|
8
|
+
# (fill_in by label) reads the session's current snapshot, the same way a
|
|
9
|
+
# person finds a field by the label printed next to it.
|
|
10
|
+
# Readable action vocabulary layered on a driver. Every action that changes
# the page answers with { ok: true, message: "..." } so callers can report
# outcomes uniformly; #current_url and #screenshot return the driver's value.
#
# The session is kept because #fill_in looks fields up in the session's
# current snapshot.
class Actions
  # Pixels moved by one #scroll call.
  SCROLL_STEP = 600

  # driver  - object implementing the driver methods used below.
  # session - object whose #snapshot exposes #elements.
  def initialize(driver, session)
    @driver = driver
    @session = session
  end

  # Load +url+ in the browser.
  def navigate(url)
    @driver.go_to(url)
    ok("navigated to #{url}")
  end

  # Click the element stamped with +ref+.
  def click(ref)
    @driver.click(ref)
    ok("clicked [#{ref}]")
  end

  # Replace the contents of element +ref+ with +text+.
  def type(ref, text)
    @driver.type_text(ref, text, clear: true)
    ok(%(typed "#{text}" into [#{ref}]))
  end

  # Find a field whose name or placeholder equals +label+, then replace its
  # contents with +with+. Raises ElementNotFound when nothing matches.
  def fill_in(label, with:)
    target = field_ref(label)
    @driver.type_text(target, with, clear: true)
    ok(%(filled "#{label}" with "#{with}"))
  end

  # Choose +value+ in the element stamped with +ref+.
  def select(value, ref:)
    @driver.select_option(ref, value)
    ok(%(selected "#{value}" in [#{ref}]))
  end

  # Click the element stamped with +ref+ and report it as checked.
  def check(ref)
    @driver.click(ref)
    ok("checked [#{ref}]")
  end

  # Send a single key press to the page.
  def press(key)
    @driver.press_key(key)
    ok("pressed #{key}")
  end

  # Scroll one SCROLL_STEP "up", "down", "left" or "right".
  # Raises NotSupported for any other direction.
  def scroll(direction)
    @driver.scroll_by(*scroll_delta(direction))
    ok("scrolled #{direction}")
  end

  # URL reported by the driver.
  def current_url
    @driver.current_url
  end

  # Screenshot as returned by the driver's #screenshot_base64.
  def screenshot
    @driver.screenshot_base64
  end

  private

  # Uniform success payload.
  def ok(message)
    { ok: true, message: message }
  end

  # Ref of the first snapshot element whose :name, or failing that
  # :placeholder, equals +label+ ignoring case and surrounding whitespace.
  def field_ref(label)
    wanted = label.to_s.strip.downcase
    hit = @session.snapshot.elements.detect do |candidate|
      matches?(candidate[:name], wanted) || matches?(candidate[:placeholder], wanted)
    end
    return hit[:ref] unless hit.nil?

    raise ElementNotFound, "no field matching #{label.inspect}"
  end

  # True when +value+ is present and equals the already-normalized +needle+.
  def matches?(value, needle)
    return false if value.nil?

    value.to_s.strip.downcase == needle
  end

  # [dx, dy] pair for a direction name.
  def scroll_delta(direction)
    offsets = {
      "up" => [0, -SCROLL_STEP],
      "down" => [0, SCROLL_STEP],
      "left" => [-SCROLL_STEP, 0],
      "right" => [SCROLL_STEP, 0]
    }
    offsets.fetch(direction.to_s) do
      raise NotSupported, "unknown scroll direction #{direction.inspect}"
    end
  end
end
|
|
100
|
+
end
|
data/lib/flunky/agent.rb
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Flunky
|
|
4
|
+
# Optional observe/decide/act loop. The model is injected and must respond to
|
|
5
|
+
# #call(messages:, tools:) returning { text:, tool_calls: [{ id:, name:,
|
|
6
|
+
# arguments: }] }. Keeping the contract vendor-neutral is the whole point: the
|
|
7
|
+
# gem never talks to a model vendor, the caller's adapter does.
|
|
8
|
+
# Optional observe/decide/act loop around an injected model.
#
# The model must respond to #call(messages:, tools:) and return
# { text:, tool_calls: [{ id:, name:, arguments: }] }.
class Agent
  # session - the session whose snapshot seeds the prompt and whose tools run.
  # model   - any object honouring the #call contract above.
  def initialize(session, model:)
    @session = session
    @model = model
    @tools = Tools.new(session)
  end

  # Drive toward +goal+, stopping when the model requests no tools or after
  # +max_steps+ model calls. Returns the transcript: the seed user message,
  # then per step an assistant message and, when tools ran, a tool message.
  def run(goal, max_steps: 10)
    messages = [{ role: "user", content: seed_prompt(goal) }]

    max_steps.times do
      response = @model.call(messages: messages, tools: @tools.definitions)
      messages << assistant_message(response)

      calls = response[:tool_calls] || []
      break if calls.empty?

      results = calls.map { |call| run_tool(call) }
      messages << { role: "tool", content: results }
    end

    messages
  end

  private

  # Transcript entry for a model reply. The text is published under both
  # :content and :text: :content is the historical key, and :text mirrors the
  # model's reply shape, which adapters (e.g. examples/anthropic_agent.rb)
  # read back when replaying the conversation.
  def assistant_message(response)
    text = response[:text]
    { role: "assistant", content: text, tool_calls: response[:tool_calls], text: text }
  end

  # Dispatch one requested tool call and pair its output with the call id.
  def run_tool(call)
    output = @tools.dispatch(call[:name], call[:arguments] || {})
    { tool_call_id: call[:id], name: call[:name], output: output }
  end

  # First user message: the goal, brief instructions, and the page snapshot.
  def seed_prompt(goal)
    <<~PROMPT
      Goal: #{goal}

      You control a web browser through the provided tools. Use the element
      refs from the page snapshot below. Call tools until the goal is met,
      then stop.

      #{@session.snapshot.to_prompt}
    PROMPT
  end
end
|
|
53
|
+
end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Flunky
|
|
4
|
+
# Process-wide defaults. A Session merges these with its own keyword options,
|
|
5
|
+
# so per-session settings always win over the global configuration.
|
|
6
|
+
# Holder for the process-wide default settings. Every setting is a plain
# read/write attribute; #to_h exports them all as a symbol-keyed hash.
class Configuration
  attr_accessor :headless, :default_timeout, :max_elements, :window_size, :user_agent,
                :browser_options, :process_timeout

  # Start from the built-in defaults.
  def initialize
    @headless = true
    @window_size = [1280, 800]
    @default_timeout = 10
    @max_elements = 200
    @browser_options = {}
    @user_agent = nil
    @process_timeout = nil
  end

  # All settings as a hash, read through the accessors, in declaration order.
  def to_h
    names = %i[headless default_timeout max_elements window_size user_agent browser_options process_timeout]
    names.to_h { |setting| [setting, public_send(setting)] }
  end
end
|
|
32
|
+
|
|
33
|
+
class << self
  # The shared Configuration, built on first access.
  def configuration
    @configuration || (@configuration = Configuration.new)
  end

  # Yield the shared configuration to the block (when one is given) and
  # return the shared configuration.
  def configure(&block)
    block&.call(configuration)
    configuration
  end

  # Replace the shared configuration with a fresh one. Mainly here so specs
  # can start from a clean slate.
  def reset_configuration!
    @configuration = Configuration.new
  end
end
|
|
48
|
+
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Flunky
|
|
4
|
+
module Drivers
|
|
5
|
+
# The backend contract. A driver is the only place that knows how to talk to
|
|
6
|
+
# a real browser, so swapping backends (Ferrum, a remote CDP service, a stub
|
|
7
|
+
# in tests) means implementing this interface and nothing else.
|
|
8
|
+
#
|
|
9
|
+
# Every interactive method takes a +ref+: the integer the snapshot stamped on
|
|
10
|
+
# an element as a data-ai-ref attribute. Turning that ref back into a live
|
|
11
|
+
# node is the driver's job.
|
|
12
|
+
# Abstract driver interface. Every public method here raises NotSupported
# naming the receiver's class and the method, so a concrete backend overrides
# the methods it implements.
#
# Interactive methods take a +ref+ identifying the target element.
class Base
  # Lifecycle -----------------------------------------------------------

  def start
    not_supported(__method__)
  end

  def quit
    not_supported(__method__)
  end

  # Navigation and page inspection --------------------------------------

  def go_to(_url)
    not_supported(__method__)
  end

  def current_url
    not_supported(__method__)
  end

  def title
    not_supported(__method__)
  end

  def html
    not_supported(__method__)
  end

  # Run JS in the page and return its value.
  def evaluate(_js)
    not_supported(__method__)
  end

  def screenshot_base64
    not_supported(__method__)
  end

  # Interaction ---------------------------------------------------------

  def click(_ref)
    not_supported(__method__)
  end

  def type_text(_ref, _text, clear: false)
    not_supported(__method__)
  end

  def select_option(_ref, _value)
    not_supported(__method__)
  end

  def press_key(_key)
    not_supported(__method__)
  end

  def scroll_by(_dx, _dy)
    not_supported(__method__)
  end

  private

  # Raise NotSupported with a message naming this class and +method_name+.
  def not_supported(method_name)
    raise NotSupported, "#{self.class} does not implement ##{method_name}"
  end
end
|
|
72
|
+
end
|
|
73
|
+
end
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ferrum"
|
|
4
|
+
|
|
5
|
+
module Flunky
  module Drivers
    # Default backend. Ferrum drives Chrome over the DevTools Protocol with no
    # Selenium server in the loop, which keeps setup to "have Chrome installed".
    #
    # The browser is launched lazily: every page-level call goes through the
    # private #page helper, which calls #start before touching the browser.
    class FerrumDriver < Base
      # Named keys Ferrum expects as symbols; anything else is typed literally.
      KEY_MAP = {
        "Enter" => :Enter,
        "Tab" => :Tab,
        "Escape" => :Escape,
        "Backspace" => :Backspace,
        "Delete" => :Delete,
        "ArrowUp" => :Up,
        "ArrowDown" => :Down,
        "ArrowLeft" => :Left,
        "ArrowRight" => :Right
      }.freeze

      # JS expression (evaluated with `this` bound to the node) that empties a
      # field. Form controls are cleared through `value`; contenteditable hosts
      # have no meaningful `value`, so their text content is emptied instead.
      # Written as a single expression because it is passed to Node#evaluate.
      CLEAR_FIELD_JS =
        "this.isContentEditable && !/^(INPUT|TEXTAREA)$/.test(this.tagName) " \
        "? (this.textContent = '') : (this.value = '')"

      attr_reader :browser

      # browser_options passes raw Chrome flags through to Ferrum (for example
      # { "no-sandbox" => nil } when running as root in CI). process_timeout is
      # how long to wait for Chrome to come up; it is separate from the per
      # command timeout (default_timeout). Nothing is launched here; see #start.
      def initialize(headless: true, window_size: [1280, 800], default_timeout: 10,
                     user_agent: nil, browser_options: {}, process_timeout: nil)
        super()
        @headless = headless
        @window_size = window_size
        @default_timeout = default_timeout
        @user_agent = user_agent
        @browser_options = browser_options || {}
        @process_timeout = process_timeout
        @browser = nil
      end

      # Launch the browser once and memoise it; later calls return the same
      # instance. process_timeout is only forwarded when it was configured.
      def start
        return @browser if @browser

        options = {
          headless: @headless,
          window_size: @window_size,
          timeout: @default_timeout,
          browser_options: @browser_options
        }
        options[:process_timeout] = @process_timeout if @process_timeout
        @browser = Ferrum::Browser.new(**options)
        @browser.headers.set("User-Agent" => @user_agent) if @user_agent
        @browser
      end

      # Shut the browser down. The reference is dropped even when the underlying
      # quit raises, so a later #start launches a fresh browser instead of
      # handing back a dead one. Safe to call when nothing was started.
      def quit
        begin
          @browser&.quit
        ensure
          @browser = nil
        end
        nil
      end

      def go_to(url)
        page.go_to(url)
      end

      def current_url
        page.current_url
      end

      def title
        page.title
      end

      def html
        page.body
      end

      # Evaluate JS in the page and hand back its result.
      def evaluate(js)
        page.evaluate(js)
      end

      # Side-effecting JS where we do not care about the return value.
      def execute(js)
        page.execute(js)
      end

      def screenshot_base64
        page.screenshot(encoding: :base64)
      end

      # Click the element carrying the given snapshot ref.
      # Raises ElementNotFound when no element carries that ref.
      def click(ref)
        resolve(ref).click
        true
      end

      # Focus the element with the given ref and type +text+ into it.
      # With clear: true the existing content is emptied first; this is done in
      # JS rather than by sending backspaces, which is both faster and robust
      # to long existing contents.
      def type_text(ref, text, clear: false)
        node = resolve(ref)
        node.focus
        node.evaluate(CLEAR_FIELD_JS) if clear
        node.type(text)
        true
      end

      # Pick +value+ in a native <select>; any other element is simply clicked,
      # because custom (non-native) selects are just clickable elements.
      def select_option(ref, value)
        node = resolve(ref)
        if tag_name(node) == "select"
          node.select(value)
        else
          node.click
        end
        true
      end

      # Send one key. Names listed in KEY_MAP are translated to the symbol
      # Ferrum expects; anything else is passed through unchanged.
      def press_key(key)
        page.keyboard.type(KEY_MAP.fetch(key, key))
        true
      end

      # Scroll the window by the given pixel deltas. Integer() rejects anything
      # non-numeric before it can reach the interpolated script.
      def scroll_by(dx, dy)
        page.execute("window.scrollBy(#{Integer(dx)}, #{Integer(dy)})")
        true
      end

      private

      # Current page, starting the browser on first use.
      def page
        start
        @browser.page
      end

      # Integer(ref) both validates the ref and stops anything from smuggling a
      # CSS fragment into the selector.
      def resolve(ref)
        node = page.at_css(%([data-ai-ref="#{Integer(ref)}"]))
        raise ElementNotFound, "no element with ref #{ref} on the current page" if node.nil?

        node
      end

      # Lower-cased tag name of the node, read from the live DOM.
      def tag_name(node)
        node.evaluate("this.tagName")&.downcase
      end
    end
  end
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Flunky
  # Root of the Flunky exception hierarchy: rescuing this one class catches
  # every failure the gem raises on purpose.
  class Error < StandardError
  end

  # Raised when a ref (or label) does not resolve to a live node on the
  # current page.
  class ElementNotFound < Error
  end

  # Raised when the dispatcher is handed a tool name it does not know about.
  class UnknownTool < Error
  end

  # Raised when a driver is asked to do something its backend cannot do.
  class NotSupported < Error
  end
end
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
(function () {
  // Runs in the page and returns { url, title, elements: [...] }.
  //
  // Why this exists: an agent cannot reason about raw HTML, so we reduce the
  // page to the handful of things it can act on, each stamped with a stable
  // integer ref (data-ai-ref) that the Ruby side later uses to address it.
  //
  // This file must begin with the IIFE expression: the driver evaluates it as
  // `return <file>`, so a leading comment would trigger JS semicolon insertion
  // and run `return;` before the function ever executes.

  // Drop refs from a previous snapshot so numbering restarts from 1 every time
  // and never collides with stale attributes.
  document.querySelectorAll("[data-ai-ref]").forEach(function (el) {
    el.removeAttribute("data-ai-ref");
  });

  // Everything the agent is allowed to act on.
  var SELECTOR = [
    "a", "button", "input", "select", "textarea",
    "[role=button]", "[role=link]", "[role=checkbox]", "[role=tab]", "[role=menuitem]",
    "[contenteditable=true]", "[onclick]"
  ].join(",");

  // Cap on reported elements; the Ruby side sets window.__AI_MAX__ beforehand.
  var max = window.__AI_MAX__ || 200;

  // Visible means: rendered, not hidden, not fully transparent, non-empty box.
  function isVisible(el) {
    var style = window.getComputedStyle(el);
    if (style.display === "none" || style.visibility === "hidden") return false;
    if (parseFloat(style.opacity) === 0) return false;
    var rect = el.getBoundingClientRect();
    return rect.width > 0 && rect.height > 0;
  }

  // Collapse runs of whitespace and trim; null/undefined become "".
  function trimmed(text) {
    return (text || "").replace(/\s+/g, " ").trim();
  }

  // Find the <label for="..."> pointing at the given id. The id is escaped
  // before it is spliced into the selector: ids are page-controlled, and one
  // containing a quote or backslash would otherwise make querySelector throw
  // and abort the whole snapshot.
  function labelFor(id) {
    var escaped = window.CSS && typeof window.CSS.escape === "function"
      ? window.CSS.escape(id)
      : String(id).replace(/["\\]/g, "\\$&");
    return document.querySelector('label[for="' + escaped + '"]');
  }

  // Best-effort accessible name, cheapest reliable source first:
  // aria-label, aria-labelledby, <label for>, placeholder, alt, title, text.
  function accessibleName(el) {
    var label = trimmed(el.getAttribute("aria-label"));
    if (label) return label;

    var labelledby = el.getAttribute("aria-labelledby");
    if (labelledby) {
      var parts = labelledby.split(/\s+/).map(function (id) {
        var target = document.getElementById(id);
        return target ? trimmed(target.textContent) : "";
      });
      var joined = trimmed(parts.join(" "));
      if (joined) return joined;
    }

    if (el.id) {
      var forLabel = labelFor(el.id);
      var forText = forLabel ? trimmed(forLabel.textContent) : "";
      if (forText) return forText;
    }

    var placeholder = trimmed(el.getAttribute("placeholder"));
    if (placeholder) return placeholder;

    var alt = trimmed(el.getAttribute("alt"));
    if (alt) return alt;

    var title = trimmed(el.getAttribute("title"));
    if (title) return title;

    return trimmed(el.textContent);
  }

  var elements = [];
  var ref = 0;
  var candidates = document.querySelectorAll(SELECTOR);

  // Walk candidates in document order, skipping invisible ones, until the cap
  // is reached. Only reported elements consume a ref number.
  for (var i = 0; i < candidates.length && elements.length < max; i++) {
    var el = candidates[i];
    if (!isVisible(el)) continue;

    ref += 1;
    el.setAttribute("data-ai-ref", String(ref));

    var rect = el.getBoundingClientRect();
    elements.push({
      ref: ref,
      tag: el.tagName.toLowerCase(),
      type: el.getAttribute("type") || null,
      role: el.getAttribute("role") || null,
      name: accessibleName(el),
      value: el.value != null ? el.value : null,
      placeholder: el.getAttribute("placeholder") || null,
      checked: typeof el.checked === "boolean" ? el.checked : null,
      disabled: typeof el.disabled === "boolean" ? el.disabled : null,
      x: Math.round(rect.left),
      y: Math.round(rect.top)
    });
  }

  return { url: document.location.href, title: document.title, elements: elements };
})();
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Flunky
  # The object callers hold for the life of a browsing session. It owns a
  # single driver, remembers the most recent snapshot, and exposes the action
  # DSL.
  class Session
    attr_reader :driver

    # Options given here win over the global configuration. Pass +driver:+ to
    # inject a custom backend; without it a Ferrum driver is built from the
    # merged options. The driver is started immediately.
    def initialize(driver: nil, **opts)
      @snapshot = nil
      @options = Flunky.configuration.to_h.merge(opts)
      @driver = driver || build_default_driver
      @driver.start
    end

    # Navigate to +url+ and forget the cached snapshot. Returns self so calls
    # can be chained.
    def visit(url)
      tap do
        @driver.go_to(url)
        @snapshot = nil
      end
    end

    # Capture a fresh snapshot of the page, cache it, and return it.
    def observe
      fresh = Snapshot.capture(@driver, max_elements: @options[:max_elements])
      @snapshot = fresh
    end

    # Cached snapshot; captured on first use so refs exist before any action.
    def snapshot
      return @snapshot if @snapshot

      observe
    end

    # Action DSL bound to this session's driver, built once and reused.
    def actions
      return @actions if @actions

      @actions = Actions.new(@driver, self)
    end

    # Shut the underlying driver down.
    def close
      @driver.quit
    end

    private

    # Assemble the default Ferrum-backed driver from the merged options.
    # Every option is passed explicitly, even when unset (nil), and
    # browser_options falls back to an empty hash.
    def build_default_driver
      forwarded = %i[headless window_size default_timeout user_agent process_timeout]
      settings = forwarded.to_h { |key| [key, @options[key]] }
      settings[:browser_options] = @options[:browser_options] || {}
      Drivers::FerrumDriver.new(**settings)
    end
  end
end
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Flunky
  # A reduced, model-readable view of the current page: the page's url and title
  # plus the list of elements the agent can act on, each carrying its ref.
  class Snapshot
    # Location of the in-page script that produces the snapshot data.
    JS_PATH = File.expand_path("js/snapshot.js", __dir__)

    attr_reader :url, :title, :elements

    # Run the snapshot script against a live driver and wrap the result.
    # The element cap is handed to the page through window.__AI_MAX__;
    # Integer() rejects anything non-numeric before it is interpolated.
    def self.capture(driver, max_elements: Flunky.configuration.max_elements)
      driver.evaluate("window.__AI_MAX__ = #{Integer(max_elements)}")
      data = driver.evaluate(script)
      from_h(data)
    end

    # Build a Snapshot from a plain hash (string or symbol keys). Handy for the
    # capture path and for testing without a browser. A missing :elements entry
    # is treated as an empty list.
    def self.from_h(data)
      h = symbolize(data)
      new(url: h[:url], title: h[:title], elements: (h[:elements] || []).map { |e| symbolize(e) })
    end

    # Source of the snapshot script, read from disk once and cached.
    def self.script
      @script ||= File.read(JS_PATH)
    end

    # Shallow copy of +hash+ with every key converted to a symbol.
    def self.symbolize(hash)
      hash.each_with_object({}) { |(k, v), acc| acc[k.to_sym] = v }
    end
    private_class_method :symbolize

    def initialize(url:, title:, elements:)
      @url = url
      @title = title
      @elements = elements
    end

    # Compact block the model reads. Numbered refs map straight to the integers
    # tools expect, so the model can copy a number into a click/type call.
    # Each element occupies exactly one line.
    def to_prompt
      lines = ["PAGE: #{title} (#{url})", "ELEMENTS:"]
      lines.concat(elements.map { |el| element_line(el) })
      lines.join("\n")
    end

    # Raw data as a hash; element values are returned untouched.
    def to_h
      { url: url, title: title, elements: elements }
    end

    private

    # One prompt line: "[ref] <tag type=...>" followed by the quoted name and
    # any detail parts that are present.
    def element_line(el)
      header = "[#{el[:ref]}] <#{el[:tag]}#{type_suffix(el)}>"
      [header, name_part(el), *detail_parts(el)].compact.join(" ")
    end

    def type_suffix(el)
      el[:type] ? " type=#{el[:type]}" : ""
    end

    # Quoted accessible name, or nil when there is none.
    def name_part(el)
      name = squish(el[:name])
      name.empty? ? nil : %("#{name}")
    end

    # Optional trailing details. Text coming from the page is squished first:
    # a multi-line textarea value would otherwise spill onto extra lines and
    # could imitate additional "[n] <tag>" entries in the prompt.
    def detail_parts(el)
      parts = []
      parts << %(placeholder="#{squish(el[:placeholder])}") if present?(el[:placeholder])
      parts << %(value="#{squish(el[:value])}") if present?(el[:value])
      parts << "checked" if el[:checked] == true
      parts << "disabled" if el[:disabled] == true
      parts
    end

    def present?(value)
      !value.nil? && value.to_s.strip != ""
    end

    # Collapse every run of whitespace (including newlines) to a single space
    # and trim the ends, so the text always fits on one prompt line.
    def squish(value)
      value.to_s.gsub(/\s+/, " ").strip
    end
  end
end
|
data/lib/flunky/tools.rb
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Flunky
  # Adapts a session for a model client. It provides the tool schemas to
  # advertise and a dispatcher that executes a tool call by name.
  #
  # Schemas use the name/description/input_schema shape that Anthropic accepts
  # as-is; the example adapter shows the one-line wrap for OpenAI.
  class Tools
    def initialize(session)
      @session = session
    end

    # Schemas for every tool the dispatcher understands, in a fixed order.
    def definitions
      specs = [
        ["navigate", "Load a URL in the browser.",
         { url: { type: "string", description: "Absolute URL to open" } }],
        ["click", "Click the element with the given ref.",
         { ref: { type: "integer", description: "Element ref from the page snapshot" } }],
        ["type", "Type text into the element with the given ref (clears it first).",
         { ref: { type: "integer" }, text: { type: "string" } }],
        ["fill_in", "Fill a field found by its label or placeholder text.",
         { label: { type: "string" }, text: { type: "string" } }],
        ["select", "Select an option in a dropdown element.",
         { ref: { type: "integer" }, value: { type: "string" } }],
        ["press", "Press a key, for example Enter or Tab.",
         { key: { type: "string" } }],
        ["scroll", "Scroll the page in a direction.",
         { direction: { type: "string", enum: %w[up down left right] } }],
        ["read_page", "Re-read the page and return a fresh snapshot.", {}]
      ]
      specs.map { |tool_name, summary, fields| tool(tool_name, summary, **fields) }
    end

    # Execute the named tool and attach a freshly observed page view to its
    # result, so the model always sees the outcome of what it just did.
    # An unknown tool name raises UnknownTool; any other Flunky::Error is
    # reported as { ok: false, error: message } instead of being raised.
    def dispatch(name, args = {})
      outcome = run_tool(name, symbolize(args))
      outcome.merge(page: @session.observe.to_prompt)
    rescue UnknownTool
      raise
    rescue Flunky::Error => e
      { ok: false, error: e.message }
    end

    private

    # Route one tool call to the matching session action.
    def run_tool(name, args)
      case name.to_s
      when "navigate"
        @session.actions.navigate(args[:url])
      when "click"
        @session.actions.click(args[:ref])
      when "type"
        @session.actions.type(args[:ref], args[:text])
      when "fill_in"
        @session.actions.fill_in(args[:label], with: args[:text])
      when "select"
        @session.actions.select(args[:value], ref: args[:ref])
      when "press"
        @session.actions.press(args[:key])
      when "scroll"
        @session.actions.scroll(args[:direction])
      when "read_page"
        # Nothing to do: dispatch adds the fresh page view on its own.
        { ok: true, message: "read page" }
      else
        raise UnknownTool, "unknown tool #{name.inspect}"
      end
    end

    # Build a single schema entry; every declared property is required.
    def tool(name, description, **properties)
      schema = {
        type: "object",
        properties: properties,
        required: properties.keys.map(&:to_s)
      }
      { name: name, description: description, input_schema: schema }
    end

    # Shallow copy of +args+ with symbol keys; nil is treated as no arguments.
    def symbolize(args)
      (args || {}).each_with_object({}) { |(key, value), out| out[key.to_sym] = value }
    end
  end
end
|
data/lib/flunky.rb
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "flunky/version"
|
|
4
|
+
require_relative "flunky/errors"
|
|
5
|
+
require_relative "flunky/configuration"
|
|
6
|
+
require_relative "flunky/drivers/base"
|
|
7
|
+
require_relative "flunky/drivers/ferrum_driver"
|
|
8
|
+
require_relative "flunky/snapshot"
|
|
9
|
+
require_relative "flunky/actions"
|
|
10
|
+
require_relative "flunky/session"
|
|
11
|
+
require_relative "flunky/tools"
|
|
12
|
+
require_relative "flunky/agent"
|
|
13
|
+
|
|
14
|
+
module Flunky
  # Convenience entry point for opening a session.
  #
  # With a block: the session is yielded and the browser is always closed
  # afterwards, even if the block raises. The (now closed) session is returned.
  #
  # Without a block: the open session is returned and the caller is
  # responsible for calling #close on it. Closing here would hand back a
  # session whose browser has already been shut down.
  def self.session(**opts)
    session = Session.new(**opts)
    return session unless block_given?

    begin
      yield session
      session
    ensure
      session.close
    end
  end
end
|
data/sig/flunky.rbs
ADDED
metadata
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: flunky
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- alexrupom
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: exe
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2026-06-18 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: ferrum
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - ">="
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '0.14'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - ">="
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '0.14'
|
|
27
|
+
description: 'Flunky gives AI agents a real browser: a swappable driver, a page-to-prompt
|
|
28
|
+
snapshot, a Capybara-style action DSL, and vendor-neutral tool schemas plus a dispatcher.
|
|
29
|
+
The gem never calls an AI vendor itself; model clients are injected.'
|
|
30
|
+
email:
|
|
31
|
+
- alexrupom@hotmail.com
|
|
32
|
+
executables: []
|
|
33
|
+
extensions: []
|
|
34
|
+
extra_rdoc_files: []
|
|
35
|
+
files:
|
|
36
|
+
- ".rspec"
|
|
37
|
+
- ".rubocop.yml"
|
|
38
|
+
- ".ruby-version"
|
|
39
|
+
- CHANGELOG.md
|
|
40
|
+
- CODE_OF_CONDUCT.md
|
|
41
|
+
- Gemfile
|
|
42
|
+
- LICENSE.txt
|
|
43
|
+
- README.md
|
|
44
|
+
- Rakefile
|
|
45
|
+
- examples/anthropic_agent.rb
|
|
46
|
+
- flunky.gemspec
|
|
47
|
+
- lib/flunky.rb
|
|
48
|
+
- lib/flunky/actions.rb
|
|
49
|
+
- lib/flunky/agent.rb
|
|
50
|
+
- lib/flunky/configuration.rb
|
|
51
|
+
- lib/flunky/drivers/base.rb
|
|
52
|
+
- lib/flunky/drivers/ferrum_driver.rb
|
|
53
|
+
- lib/flunky/errors.rb
|
|
54
|
+
- lib/flunky/js/snapshot.js
|
|
55
|
+
- lib/flunky/session.rb
|
|
56
|
+
- lib/flunky/snapshot.rb
|
|
57
|
+
- lib/flunky/tools.rb
|
|
58
|
+
- lib/flunky/version.rb
|
|
59
|
+
- sig/flunky.rbs
|
|
60
|
+
homepage: https://github.com/alexrupom/flunky
|
|
61
|
+
licenses:
|
|
62
|
+
- MIT
|
|
63
|
+
metadata:
|
|
64
|
+
rubygems_mfa_required: 'true'
|
|
65
|
+
homepage_uri: https://github.com/alexrupom/flunky
|
|
66
|
+
source_code_uri: https://github.com/alexrupom/flunky
|
|
67
|
+
changelog_uri: https://github.com/alexrupom/flunky/blob/main/CHANGELOG.md
|
|
68
|
+
post_install_message:
|
|
69
|
+
rdoc_options: []
|
|
70
|
+
require_paths:
|
|
71
|
+
- lib
|
|
72
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
73
|
+
requirements:
|
|
74
|
+
- - ">="
|
|
75
|
+
- !ruby/object:Gem::Version
|
|
76
|
+
version: 3.0.0
|
|
77
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
78
|
+
requirements:
|
|
79
|
+
- - ">="
|
|
80
|
+
- !ruby/object:Gem::Version
|
|
81
|
+
version: '0'
|
|
82
|
+
requirements: []
|
|
83
|
+
rubygems_version: 3.5.22
|
|
84
|
+
signing_key:
|
|
85
|
+
specification_version: 4
|
|
86
|
+
summary: Let any AI agent drive a real browser.
|
|
87
|
+
test_files: []
|