agent_ferrum 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,82 @@
1
+ # frozen_string_literal: true
2
+
3
+ module AgentFerrum
4
+ module Content
5
# Collects the interactive elements of the current page from the browser's
# accessibility tree and gives each one a short reference ("@e1", "@e2", ...).
#
# The tree is fetched once, at construction time, by sending the
# "Accessibility.getFullAXTree" command through browser.ferrum.page.
class AccessibilityTree
  # AX roles that are kept; nodes with any other role are skipped.
  INTERACTIVE_ROLES = %w[
    button link textbox checkbox radio combobox
    menuitem tab slider spinbutton searchbox switch
    option listbox menu menubar
  ].freeze

  # AX property names that are copied into a node's :properties hash.
  STATE_PROPERTIES = %w[disabled required checked selected readonly].freeze

  # refs  - Hash mapping a ref string ("@e1") to its node-info Hash.
  # nodes - Array of the same node-info Hashes, in the order they were found.
  attr_reader :refs, :nodes

  # @param browser [#ferrum] object whose ferrum.page responds to #command
  def initialize(browser)
    @browser = browser
    @refs = {}
    @nodes = []
    extract!
  end

  # @return [String] one line per collected node, joined with newlines
  def to_s
    @nodes.map { |node| format_node(node) }.join("\n")
  end

  private

  # Fetches the AX tree and fills @refs / @nodes with the non-ignored nodes
  # whose role is listed in INTERACTIVE_ROLES.
  def extract!
    result = @browser.ferrum.page.command("Accessibility.getFullAXTree")
    # A response without a "nodes" array is treated as an empty tree instead
    # of failing with NoMethodError on nil.
    ax_nodes = (result && result["nodes"]) || []

    ax_nodes.each do |ax_node|
      role = ax_node.dig("role", "value")
      next unless INTERACTIVE_ROLES.include?(role)
      next if ignored?(ax_node)

      # Refs are numbered by position among the kept nodes, starting at 1.
      ref = "@e#{@nodes.size + 1}"
      node_info = {
        ref: ref,
        role: role,
        name: ax_node.dig("name", "value") || "",
        value: ax_node.dig("value", "value"),
        description: ax_node.dig("description", "value"),
        backend_node_id: ax_node["backendDOMNodeId"],
        properties: extract_properties(ax_node)
      }

      @refs[ref] = node_info
      @nodes << node_info
    end
  end

  # The "ignored" field is accepted either as a bare boolean or as a Hash
  # carrying the boolean under "value"; anything else counts as not ignored.
  def ignored?(ax_node)
    flag = ax_node["ignored"]
    flag = flag["value"] if flag.is_a?(Hash)
    flag == true
  end

  # @return [Hash{String => Object}] the node's STATE_PROPERTIES entries
  def extract_properties(ax_node)
    (ax_node["properties"] || []).each_with_object({}) do |prop, props|
      name = prop["name"]
      props[name] = prop.dig("value", "value") if STATE_PROPERTIES.include?(name)
    end
  end

  # Renders `@e1: [role] "name"`, followed by value="..." when a value is
  # present and by key=value pairs when the node has tracked properties.
  def format_node(node)
    parts = ["#{node[:ref]}: [#{node[:role]}] \"#{node[:name]}\""]
    parts << "value=\"#{node[:value]}\"" if node[:value]
    parts << node[:properties].map { |k, v| "#{k}=#{v}" }.join(" ") if node[:properties]&.any?
    parts.join(" ")
  end
end
81
+ end
82
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "reverse_markdown"
4
+
5
+ module AgentFerrum
6
+ module Content
7
# Converts an HTML string to Markdown with ReverseMarkdown and then tidies
# the result.
class MarkdownConverter
  # @param html [String] HTML to convert
  def initialize(html)
    @html = html
  end

  # @return [String] the compacted Markdown rendering of the HTML
  def convert
    markdown = ReverseMarkdown.convert(@html,
                                       unknown_tags: :bypass,
                                       github_flavored: true)
    compact(markdown)
  end

  private

  # Tidies converter output while keeping paragraph breaks intact.
  #
  # Steps, in order:
  # 1. drop images that have no alt text (done before the link rule so that
  #    `![]()` does not leave a stray "!" behind);
  # 2. replace links with an empty target by their text;
  # 3. strip trailing whitespace, which also empties whitespace-only lines;
  # 4. collapse runs of three or more newlines to exactly one blank line;
  # 5. trim the whole string.
  #
  # Trailing whitespace is matched with `[ \t\r]+$` rather than `\s+\n`:
  # `\s` also matches "\n", so the latter removed every blank line and merged
  # all paragraphs.
  def compact(markdown)
    markdown
      .gsub(/!\[\]\([^)]*\)/, "")
      .gsub(/\[([^\]]*)\]\(\s*\)/, '\1')
      .gsub(/[ \t\r]+$/, "")
      .gsub(/\n{3,}/, "\n\n")
      .strip
  end
end
31
+ end
32
+ end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ module AgentFerrum
4
+ module Content
5
# Point-in-time capture of a page: its URL, title, interactive elements
# (AccessibilityTree) and visible content rendered as Markdown.
class Snapshot
  attr_reader :accessibility, :markdown_content, :url, :title, :refs

  # Captures everything eagerly; nothing is re-read from the browser later.
  #
  # @param browser [Object] must respond to #current_url and #title and is
  #   handed on to VisibilityFilter and AccessibilityTree
  def initialize(browser)
    @url = browser.current_url
    @title = browser.title

    # The filtered HTML is read before the accessibility tree is built and
    # converted to Markdown afterwards.
    visible_html = VisibilityFilter.new(browser).filtered_html
    @accessibility = AccessibilityTree.new(browser)
    @refs = @accessibility.refs
    @markdown_content = MarkdownConverter.new(visible_html).convert
  end

  # @return [String] same value as #markdown_content
  def markdown = @markdown_content

  # @return [String] the interactive-element listing
  def accessibility_tree = @accessibility.to_s

  # @return [String] title, URL, interactive elements and page content as
  #   one Markdown document ending in a newline
  def to_s
    sections = [
      "# #{@title}",
      "URL: #{@url}",
      "",
      "## Interactive Elements",
      "#{@accessibility}",
      "",
      "## Page Content",
      "#{@markdown_content}"
    ]
    "#{sections.join("\n")}\n"
  end

  # @return [Integer] length of #to_s divided by four (integer division)
  def estimated_tokens
    to_s.length.div(4)
  end
end
43
+ end
44
+ end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+
5
+ module AgentFerrum
6
+ module Content
7
# Produces a reduced copy of the page's body HTML: a script run in the page
# removes non-content and hidden elements from a clone of document.body, and
# the result is then cleaned further with Nokogiri.
class VisibilityFilter
  # Runs in the page. Works on a deep clone of document.body, so the live
  # DOM is not modified, and returns the clone's innerHTML.
  FILTER_SCRIPT = <<~JS
    (() => {
      const clone = document.body.cloneNode(true);
      ['script','style','noscript','svg','path','meta','link',
       'template','iframe'].forEach(tag => {
        clone.querySelectorAll(tag).forEach(el => el.remove());
      });
      clone.querySelectorAll('[aria-hidden="true"]').forEach(el => el.remove());
      clone.querySelectorAll('[hidden]').forEach(el => el.remove());
      return clone.innerHTML;
    })()
  JS

  # @param browser [#ferrum] object whose ferrum responds to #evaluate
  def initialize(browser)
    @browser = browser
  end

  # @return [String] the filtered, attribute-stripped HTML
  def filtered_html
    post_process(@browser.ferrum.evaluate(FILTER_SCRIPT))
  end

  private

  # Removes comment nodes and the listed presentation / test / inline-handler
  # attributes from every element, then serialises the fragment again.
  def post_process(html)
    fragment = Nokogiri::HTML::DocumentFragment.parse(html)
    fragment.xpath("//comment()").remove
    fragment.traverse { |node| strip_attributes(node) if node.element? }
    fragment.to_html
  end

  # Deletes each unwanted attribute from a single element node.
  def strip_attributes(element)
    %w[style class data-testid data-cy onclick onload onerror].each do |attr|
      element.delete(attr)
    end
  end
end
43
+ end
44
+ end
@@ -0,0 +1,52 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fileutils"
4
+
5
+ module AgentFerrum
6
# Configures the browser's download directory and waits for a downloaded
# file to appear in it.
class Downloads
  # Seconds between directory polls in #wait (upper bound; see #wait).
  POLL_INTERVAL = 0.5

  # @return [String, nil] absolute download directory, nil until assigned
  attr_reader :download_path

  # @param browser [#ferrum] object whose ferrum.page responds to #command
  def initialize(browser)
    @browser = browser
    @download_path = nil
  end

  # Expands the path, creates the directory if needed and sends
  # "Browser.setDownloadBehavior" (behavior "allow") with that directory.
  def download_path=(path)
    @download_path = File.expand_path(path)
    FileUtils.mkdir_p(@download_path)
    @browser.ferrum.page.command("Browser.setDownloadBehavior",
                                 behavior: "allow",
                                 downloadPath: @download_path)
  end

  # Polls the download directory until a finished file shows up.
  #
  # A file only counts when its mtime is newer than `timeout` seconds before
  # the moment of the check.
  #
  # @param timeout [Numeric] maximum seconds to wait
  # @param filename [String, nil] exact basename to wait for; when nil the
  #   most recently modified finished file is used
  # @return [String] path of the matching file
  # @raise [AgentFerrum::Error] when no download path has been set
  # @raise [Waiter::TimeoutError] when nothing matched before the deadline
  def wait(timeout: 30, filename: nil)
    raise AgentFerrum::Error, "Download path not set. Set download_path first." unless @download_path

    deadline = monotonic_now + timeout

    loop do
      match = find_completed_download(filename)
      return match if match && modified_after?(match, Time.now - timeout)

      remaining = deadline - monotonic_now
      raise Waiter::TimeoutError, "Download timeout (#{timeout}s)" if remaining <= 0

      # Never sleep past the deadline (same clamping as Waiter#call).
      sleep [POLL_INTERVAL, remaining].min
    end
  end

  private

  def monotonic_now
    Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end

  # mtime of the file, or nil if it disappeared after being listed.
  def mtime_of(path)
    File.mtime(path)
  rescue Errno::ENOENT
    nil
  end

  def modified_after?(path, cutoff)
    mtime = mtime_of(path)
    !mtime.nil? && mtime > cutoff
  end

  # Lists regular files in the download directory, skipping names ending in
  # ".crdownload" or ".tmp", and picks either the one named `filename` or the
  # most recently modified one.
  def find_completed_download(filename)
    # `base:` keeps glob metacharacters in the directory name ("[", "{", "*")
    # from being interpreted as part of the pattern.
    candidates = Dir.glob("*", base: @download_path)
                    .reject { |name| name.end_with?(".crdownload", ".tmp") }
                    .map { |name| File.join(@download_path, name) }
                    .select { |path| File.file?(path) }

    return candidates.find { |path| File.basename(path) == filename } if filename

    newest = candidates.filter_map { |path| (mtime = mtime_of(path)) && [mtime, path] }
                       .max_by(&:first)
    newest&.last
  end
end
52
+ end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
module AgentFerrum
  # Base class of every error defined in this module.
  class Error < StandardError; end

  # Raised for an element ref that is not known; the message names the ref
  # and points to browser.snapshot.
  class RefNotFoundError < Error
    # @param ref [#to_s] the ref that was looked up
    def initialize(ref)
      message = "Element ref '#{ref}' not found. Call browser.snapshot to refresh refs."
      super(message)
    end
  end

  # Raised when a selector matches nothing; the message names the selector.
  class ElementNotFoundError < Error
    # @param selector [#to_s] the selector that was looked up
    def initialize(selector)
      message = "No element matches '#{selector}'. Check the selector or call snapshot to see available elements."
      super(message)
    end
  end

  # Error subclasses without behaviour of their own.
  NavigationError = Class.new(Error)
  StealthError = Class.new(Error)
end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ module AgentFerrum
4
# Wrapper around a Ferrum node. Actions (click, fill, select, hover, focus)
# are re-attempted when Ferrum::NodeMovingError or
# Ferrum::CoordinatesNotFoundError is raised; readers are plain delegations.
class Node
  # Total number of attempts made for a retried action.
  MAX_RETRIES = 3

  # @param ferrum_node [Object] the wrapped node
  def initialize(ferrum_node)
    @node = ferrum_node
  end

  def click = with_retry { @node.click }

  # Focuses the node, then types `value` into it.
  def fill(value)
    with_retry do
      @node.focus
      @node.type(value)
    end
  end

  def select(value) = with_retry { @node.select(value) }

  def hover = with_retry { @node.hover }

  def focus = with_retry { @node.focus }

  def text = @node.text

  def value = @node.value

  def [](attr) = @node[attr]

  private

  # Yields up to MAX_RETRIES times, sleeping 0.1s between attempts, and
  # returns the block's value. The error from the final attempt is re-raised;
  # any other exception class propagates immediately.
  def with_retry
    1.upto(MAX_RETRIES) do |attempt|
      return yield
    rescue Ferrum::NodeMovingError, Ferrum::CoordinatesNotFoundError
      raise if attempt >= MAX_RETRIES

      sleep 0.1
    end
  end
end
61
+ end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ module AgentFerrum
4
+ module Stealth
5
# Installs the stealth scripts of a profile on a page.
class Manager
  # Directory holding utils.js and one <name>.js file per script name.
  SCRIPTS_DIR = File.join(__dir__, "scripts")

  # Registers every script of `profile` with
  # "Page.addScriptToEvaluateOnNewDocument". When the profile contains
  # "user_agent" and the page's current navigator.userAgent includes
  # "HeadlessChrome", the user agent is additionally overridden through
  # "Network.setUserAgentOverride" with that token replaced by "Chrome".
  #
  # @param page [#command] object used to send the protocol commands
  # @param profile [Object] key understood by Profiles.scripts_for
  # @return [void]
  def apply(page, profile)
    scripts = Profiles.scripts_for(profile)
    scripts.each do |script_name|
      page.command("Page.addScriptToEvaluateOnNewDocument", source: load_script(script_name))
    end

    # Override user-agent at CDP level if included in the profile
    return unless scripts.include?("user_agent")

    current_ua = page.command("Runtime.evaluate", expression: "navigator.userAgent").dig("result", "value") || ""
    return unless current_ua.include?("HeadlessChrome")

    page.command("Network.setUserAgentOverride", userAgent: current_ua.gsub("HeadlessChrome", "Chrome"))
  end

  private

  # Builds the source injected for one script: the shared helpers from
  # utils.js followed by the script itself, wrapped in an immediately-invoked
  # arrow function.
  #
  # The wrapper gives every injected script its own scope. Without it, each
  # script repeats the top-level `const` declarations from utils.js in the
  # same global scope (a redeclaration error for the second script onwards),
  # and names such as `ua` or `getParameter` become visible to page scripts.
  def load_script(name)
    script = File.read(File.join(SCRIPTS_DIR, "#{name}.js"))
    "(() => {\n#{utils_source}\n#{script}\n})();"
  end

  # utils.js is identical for every script, so it is read only once.
  def utils_source
    @utils_source ||= File.read(File.join(SCRIPTS_DIR, "utils.js"))
  end
end
33
+ end
34
+ end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ module AgentFerrum
4
+ module Stealth
5
# Named stealth profiles and the script names each of them enables.
module Profiles
  # Profile key => list of script names.
  PROFILES = {
    minimal: %w[webdriver],
    moderate: %w[webdriver navigator_vendor chrome_runtime user_agent],
    maximum: %w[webdriver navigator_vendor navigator_plugins chrome_runtime
                webgl_vendor iframe_content_window user_agent]
  }.freeze

  class << self
    # @param profile [Symbol] one of the PROFILES keys
    # @return [Array<String>] script names of that profile
    # @raise [StealthError] when the key is not in PROFILES; the message
    #   lists the valid keys
    def scripts_for(profile)
      return PROFILES[profile] if PROFILES.key?(profile)

      raise StealthError, "Unknown stealth profile: #{profile}. Valid: #{PROFILES.keys.join(", ")}"
    end
  end
end
19
+ end
20
+ end
@@ -0,0 +1,34 @@
1
// Provide the window.chrome members that detection checks look for
// (runtime, csi, loadTimes). Members that already exist are left untouched.
window.chrome ||= {};

window.chrome.runtime ||= {
  connect: function() {},
  sendMessage: function() {},
  id: undefined
};

// chrome.csi stub: returns an empty object.
window.chrome.csi ||= function() { return {}; };

// chrome.loadTimes stub: timing fields are derived from the current time
// (in seconds) at the moment of the call.
window.chrome.loadTimes ||= function() {
  const secondsFromNow = (offset) => Date.now() / 1000 + offset;
  return {
    commitLoadTime: secondsFromNow(0),
    connectionInfo: 'http/1.1',
    finishDocumentLoadTime: secondsFromNow(0.1),
    finishLoadTime: secondsFromNow(0.2),
    firstPaintAfterLoadTime: 0,
    firstPaintTime: secondsFromNow(0.05),
    navigationType: 'Other',
    npnNegotiatedProtocol: 'http/1.1',
    requestTime: secondsFromNow(-0.5),
    startLoadTime: secondsFromNow(-0.4),
    wasAlternateProtocolAvailable: false,
    wasFetchedViaSpdy: false,
    wasNpnNegotiated: false
  };
};
@@ -0,0 +1,22 @@
1
// Wrap the HTMLIFrameElement.prototype.contentWindow getter so that a frame
// window lacking a `chrome` property receives the parent's window.chrome.
// Any failure while installing the wrapper is swallowed.
try {
  const nativeDescriptor = Object.getOwnPropertyDescriptor(HTMLIFrameElement.prototype, 'contentWindow');

  Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', {
    configurable: true,
    get: function() {
      const frameWindow = nativeDescriptor.get.call(this);
      if (frameWindow !== null) {
        try {
          // Ensure the contentWindow has the expected chrome property
          if (!frameWindow.chrome) {
            frameWindow.chrome = window.chrome;
          }
        } catch (e) {
          // Cross-origin frame, ignore
        }
      }
      return frameWindow;
    }
  });
} catch (e) {}
@@ -0,0 +1,64 @@
1
// Replace navigator.plugins with a mock PluginArray describing three plugins.
// Relies on overrideGetter, which must already be defined when this runs.
const mockPlugins = [
  {
    name: 'Chrome PDF Plugin',
    description: 'Portable Document Format',
    filename: 'internal-pdf-viewer',
    mimeTypes: [{ type: 'application/x-google-chrome-pdf', suffixes: 'pdf', description: 'Portable Document Format' }]
  },
  {
    name: 'Chrome PDF Viewer',
    description: '',
    filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai',
    mimeTypes: [{ type: 'application/pdf', suffixes: 'pdf', description: '' }]
  },
  {
    name: 'Native Client',
    description: '',
    filename: 'internal-nacl-plugin',
    mimeTypes: [
      { type: 'application/x-nacl', suffixes: '', description: 'Native Client Executable' },
      { type: 'application/x-pnacl', suffixes: '', description: 'Portable Native Client Executable' }
    ]
  }
];

// Builds an object with MimeType.prototype whose fields are exposed through
// getters; enabledPlugin points back at the owning plugin.
const createMimeType = (mt, plugin) => {
  const mimeType = Object.create(MimeType.prototype);
  const fields = [
    ['type', mt.type],
    ['suffixes', mt.suffixes],
    ['description', mt.description],
    ['enabledPlugin', plugin]
  ];
  for (const [key, fieldValue] of fields) {
    overrideGetter(mimeType, key, fieldValue);
  }
  return mimeType;
};

// Builds an object with Plugin.prototype; its mime types are reachable by
// index, by type string, and by iteration.
const createPlugin = (p) => {
  const plugin = Object.create(Plugin.prototype);
  overrideGetter(plugin, 'name', p.name);
  overrideGetter(plugin, 'description', p.description);
  overrideGetter(plugin, 'filename', p.filename);
  overrideGetter(plugin, 'length', p.mimeTypes.length);
  for (const [index, mt] of p.mimeTypes.entries()) {
    const mimeType = createMimeType(mt, plugin);
    plugin[index] = mimeType;
    plugin[mt.type] = mimeType;
  }
  plugin[Symbol.iterator] = function* () {
    let index = 0;
    while (index < this.length) {
      yield this[index];
      index += 1;
    }
  };
  return plugin;
};

// Any failure while assembling or installing the array is swallowed.
try {
  const plugins = mockPlugins.map(createPlugin);
  const pluginArray = Object.create(PluginArray.prototype);
  for (const [index, plugin] of plugins.entries()) {
    pluginArray[index] = plugin;
    pluginArray[plugin.name] = plugin;
  }
  overrideGetter(pluginArray, 'length', plugins.length);
  pluginArray[Symbol.iterator] = function* () {
    let index = 0;
    while (index < this.length) {
      yield this[index];
      index += 1;
    }
  };
  overrideGetter(navigator, 'plugins', pluginArray);
} catch (e) {}
@@ -0,0 +1,4 @@
1
// Spoof navigator.vendor, navigator.platform and navigator.maxTouchPoints.
// Relies on overrideGetter, which must already be defined when this runs.
[
  ['vendor', 'Google Inc.'],
  ['platform', 'Win32'],
  ['maxTouchPoints', 0]
].forEach(([prop, spoofed]) => {
  overrideGetter(navigator, prop, spoofed);
});
@@ -0,0 +1,12 @@
1
// Clean HeadlessChrome from navigator.userAgent and navigator.appVersion.
// Nothing is changed unless the user agent contains "HeadlessChrome";
// appVersion is only inspected in that case. Relies on overrideGetter.
const ua = navigator.userAgent;
const hasHeadlessToken = (text) => text.includes('HeadlessChrome');
const withoutHeadlessToken = (text) => text.replace('HeadlessChrome', 'Chrome');

if (hasHeadlessToken(ua)) {
  overrideGetter(navigator, 'userAgent', withoutHeadlessToken(ua));

  // Also fix appVersion
  const appVersion = navigator.appVersion;
  if (hasHeadlessToken(appVersion)) {
    overrideGetter(navigator, 'appVersion', withoutHeadlessToken(appVersion));
  }
}
@@ -0,0 +1,24 @@
1
+ // Stealth utility helpers
2
// Makes `fn.toString()` report a native-code signature.
//
// Function.prototype.toString is replaced by a proxy around the current
// implementation. The proxy answers with a native-looking string only for
// `fn` and for the proxy itself; every other function is passed through to
// the wrapped implementation, so unrelated functions keep their real source.
// Calls can be stacked: each call wraps whatever toString is installed at
// that moment, so earlier spoofs keep working.
//
//   fn   - function whose source should be hidden
//   name - name shown in the fake signature; defaults to fn.name
const makeNativeToString = (fn, name = '') => {
  const spoofedSource = `function ${name || fn.name || ''}() { [native code] }`;
  const proxy = new Proxy(Function.prototype.toString, {
    apply: function(target, ctx, args) {
      if (ctx === proxy) {
        return `function toString() { [native code] }`;
      }
      if (ctx === fn) {
        return spoofedSource;
      }
      return Reflect.apply(target, ctx, args);
    }
  });
  try {
    Function.prototype.toString = proxy;
  } catch (e) {}
};
16
+
17
// Defines `prop` on `obj` as a configurable accessor that always returns
// `value`. If the property cannot be defined, the error is swallowed and
// `obj` is left as it was.
const overrideGetter = (obj, prop, value) => {
  const descriptor = {
    configurable: true,
    get: () => value
  };
  try {
    Object.defineProperty(obj, prop, descriptor);
  } catch (e) {}
};
@@ -0,0 +1,16 @@
1
// Make navigator.webdriver read as false.
Object.defineProperty(navigator, 'webdriver', {
  configurable: true,
  get: () => false
});

// Also handle the permissions API check: a 'notifications' query resolves
// with the current Notification.permission as its state; every other query
// goes to the original implementation.
if (navigator.permissions) {
  const originalQuery = navigator.permissions.query;
  navigator.permissions.query = (parameters) =>
    parameters.name === 'notifications'
      ? Promise.resolve({ state: Notification.permission })
      : originalQuery.call(navigator.permissions, parameters);
}
@@ -0,0 +1,27 @@
1
// Override WebGL vendor/renderer to hide headless indicators.
// 37445 is UNMASKED_VENDOR_WEBGL, 37446 is UNMASKED_RENDERER_WEBGL.
// A Map is used so that only the exact numeric parameter values match.
const spoofedParameters = new Map([
  [37445, 'Intel Inc.'],
  [37446, 'Intel Iris OpenGL Engine']
]);

// Replaces getParameter on the given prototype; parameters that are not
// spoofed are answered by the original implementation.
const patchGetParameter = (proto) => {
  const nativeGetParameter = proto.getParameter;
  proto.getParameter = function(parameter) {
    if (spoofedParameters.has(parameter)) {
      return spoofedParameters.get(parameter);
    }
    return nativeGetParameter.call(this, parameter);
  };
};

patchGetParameter(WebGLRenderingContext.prototype);

// Also handle WebGL2
if (typeof WebGL2RenderingContext !== 'undefined') {
  patchGetParameter(WebGL2RenderingContext.prototype);
}
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module AgentFerrum
  # Version string of the gem.
  VERSION = "0.1.0"
end
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
+ module AgentFerrum
4
# Polls the browser until a condition holds or a timeout expires.
class Waiter
  # @param browser [#ferrum] object whose ferrum responds to #at_css and
  #   #at_xpath; it is also what a condition block receives
  # @param default_timeout [Numeric] seconds used when #call gets no timeout
  # @param default_interval [Numeric] seconds between polls when #call gets
  #   no interval
  def initialize(browser, default_timeout:, default_interval:)
    @browser = browser
    @default_timeout = default_timeout
    @default_interval = default_interval
  end

  # Waits for the first truthy result of the condition.
  #
  # The condition is, in order of precedence: the given block (called with
  # the browser), a CSS selector, an XPath expression, or a text fragment
  # that some element's text node must contain.
  #
  # @return [Object] the truthy value produced by the condition
  # @raise [Waiter::TimeoutError] when the deadline passes first
  def call(css: nil, xpath: nil, text: nil, timeout: nil, interval: nil, &)
    timeout ||= @default_timeout
    interval ||= @default_interval
    deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + timeout

    loop do
      result = check_condition(css:, xpath:, text:, &)
      return result if result

      remaining = deadline - Process.clock_gettime(Process::CLOCK_MONOTONIC)
      raise Waiter::TimeoutError, build_message(css:, xpath:, text:, timeout:) if remaining <= 0

      # Never sleep past the deadline.
      sleep [interval, remaining].min
    end
  end

  class TimeoutError < AgentFerrum::Error; end

  private

  # Evaluates the condition once; Ferrum::NodeNotFoundError counts as
  # "not yet" and yields nil.
  def check_condition(css:, xpath:, text:, &block)
    if block
      block.call(@browser)
    elsif css
      @browser.ferrum.at_css(css)
    elsif xpath
      @browser.ferrum.at_xpath(xpath)
    elsif text
      @browser.ferrum.at_xpath("//*[contains(text(), #{xpath_literal(text)})]")
    end
  rescue Ferrum::NodeNotFoundError
    nil
  end

  # Renders `str` as an XPath 1.0 string literal, quotes included.
  #
  # XPath 1.0 literals have no escape character, so a backslash before a
  # quote does not help: the literal must be delimited by the quote kind the
  # text does not contain, and text containing both kinds has to be assembled
  # with concat().
  def xpath_literal(str)
    return "'#{str}'" unless str.include?("'")
    return "\"#{str}\"" unless str.include?('"')

    pieces = str.split("'", -1).map { |piece| "'#{piece}'" }
    "concat(#{pieces.join(%q(, "'", ))})"
  end

  def build_message(css:, xpath:, text:, timeout:)
    target = css || xpath || text || "block condition"
    "Timeout (#{timeout}s) waiting for: #{target}"
  end
end
54
+ end