RubyGems - spidy - Versions diffs - 1.0.1 → 1.1.1 - Mend

spidy 1.0.1 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

checksums.yaml +4 -4
data/Gemfile.lock +1 -1
data/example/detail.rb +77 -0
data/example/lightpanda_define.rb +28 -0
data/lib/spidy/binder/lightpanda.rb +26 -0
data/lib/spidy/binder.rb +3 -0
data/lib/spidy/version.rb +1 -1
metadata +5 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f1eebbdb7391e07252e47811959d8a126fe3566c50b2cd5a98d17d1ad4eaff14
-  data.tar.gz: 2a3fea8707650684d72d4b6a26ac253acf413bba4cb9cc7eb89eb2aea4a17130
+  metadata.gz: 044aaddab677c3b7d4e482c0c17d3f02de1bbc84d767cb1f78dfde5f4be0f9ee
+  data.tar.gz: d2f6a2bd0e346b977e30d9bbf785eb6374013736110c0a03b086bc225c71e80e
 SHA512:
-  metadata.gz: 4744da97bed048c7c832543e747c4152b391e407ff70bba62bc730f2b18ea5fb9d97a9552ab54010d9693eaed41bd695dd782e5fb6cf0578974705c0cdc46b14
-  data.tar.gz: ea20552346def655974bd8e348e2320d12060b97e3fc0a44c410ae01a3db57a5d7f8fa0f2d7a3639d9b8d7c1128c05c59648ac09ca2e219bd1e20fff6948a1aa
+  metadata.gz: d7f5753032274c5446b2de13f6ca3487f501898d83056d39d3096522f592842b3ee7f77ed39382429bf444778f29d6394a83cee912430963062efad4eb354bc9
+  data.tar.gz: 8f144620039a59238866daa983d5790cf7c6e47df1f53a550736eba452a5f08f28941730f5e9e7991e537063c820bb1a9121a2e3335cd81ef33562bd3b20c52d

data/Gemfile.lock CHANGED

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    spidy (1.0.0)
+    spidy (1.1.1)
       activesupport (~> 7.1)
       mechanize
       socksify

data/example/detail.rb ADDED

@@ -0,0 +1,77 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+require_relative '../lib/spidy'
+# Define a scraper that uses the Lightpanda connector for Instagram profile
+scraper = Spidy.define do
+  # Set a user agent to mimic a desktop browser
+  user_agent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \
+             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+  # Define a spider that uses the Lightpanda connector
+  spider(as: :lightpanda) do |yielder, connector, url|
+    puts "Processing Instagram profile: #{url}" if ENV['DEBUG']
+    connector.call(url) do |page|
+      yielder.call(page)
+    end
+  end
+  # Define the object structure we want to extract
+  define(as: :lightpanda) do
+    let(:title) { html.title }
+    let(:profile_name) { html.at('meta[property="og:title"]')&.[]('content') }
+    let(:description) { html.at('meta[property="og:description"]')&.[]('content') }
+    let(:image) { html.at('meta[property="og:image"]')&.[]('content') }
+    let(:followers_count) do
+      # Try to extract followers count from various possible locations
+      text = html.css('span._ac2a, span._aacl, span[data-count]').detect do |el|
+        el.text =~ /followers/i
+      end&.text
+      if text
+        # Extract just the number
+        text.gsub(/[^\d,\.]/, '').gsub(/[,\.]/, '').to_i
+      else
+        nil
+      end
+    end
+    let(:posts) do
+      html.css('article img').map do |img|
+        {
+          src: img['src'],
+          alt: img['alt']
+        }
+      end
+    end
+  end
+end
+# Get the URL from stdin
+url = STDIN.read.strip
+begin
+  # Run the scraper
+  result = scraper.call(url)
+  # Output the results
+  puts "---- Instagram Profile: #{result.profile_name} ----"
+  puts "Description: #{result.description}"
+  puts "Followers: #{result.followers_count || 'Unable to extract'}"
+  puts "\nProfile Image: #{result.image}" if result.image
+  puts "\nRecent Posts: #{result.posts.size}"
+  result.posts.each_with_index do |post, i|
+    puts "#{i+1}. #{post[:alt] || 'No description'}"
+    puts "   #{post[:src]}"
+  end
+rescue StandardError => e
+  puts "Error processing profile: #{e.message}"
+  puts e.backtrace if ENV['DEBUG']
+  exit 1
+end

data/example/lightpanda_define.rb ADDED

@@ -0,0 +1,28 @@
+require 'spidy'
+# Define a scraper using LightPanda (JavaScript-rendered pages)
+definition = Spidy::Shell.run do
+  user_agent 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
+  # Define a scraper using LightPanda
+  define(as: :lightpanda) do
+    let(:title) { html.title }
+    let(:headings) { html.css('h1, h2, h3').map { it.text.strip } }
+    let(:links) { html.css('a').map { |a| { href: a['href'], text: a.text.strip } } }
+  end
+end
+# Execute the scraper on a JavaScript-heavy website
+result = definition.call('https://www.example.com')
+puts "Title: #{result.title}"
+puts "\nHeadings:"
+result.headings.each do |heading|
+  puts "- #{heading}"
+end
+puts "\nLinks:"
+result.links.each do |link|
+  puts "- #{link[:text]} (#{link[:href]})"
+end

data/lib/spidy/binder/lightpanda.rb ADDED

@@ -0,0 +1,26 @@
+#
+# Bind LightPanda page and convert to object
+#
+module Spidy::Binder::Lightpanda
+  def let(name, query = nil, &block)
+    @attribute_names ||= []
+    @attribute_names << name
+    return define_method(name) { html.at(query)&.text&.strip } if block.nil?
+    define_method(name) do
+      if query.present?
+        instance_exec(html.at(query), &block)
+      else
+        instance_exec(&block)
+      end
+    rescue StandardError => e
+      raise Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
+    end
+  end
+  def self.extended(obj)
+    obj.alias_method :html, :resource
+  end
+end

data/lib/spidy/binder.rb CHANGED

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 #
 # Bind resource received from the connection to the result object
 #
@@ -7,4 +9,5 @@ module Spidy::Binder
   autoload :Json
   autoload :Html
   autoload :Xml
+  autoload :Lightpanda
 end

data/lib/spidy/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Spidy
-  VERSION = '1.0.1'.freeze
+  VERSION = '1.1.1'.freeze
 end

metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: spidy
 version: !ruby/object:Gem::Version
-  version: 1.0.1
+  version: 1.1.1
 platform: ruby
 authors:
 - aileron
 bindir: exe
 cert_chain: []
-date: 2025-04-16 00:00:00.000000000 Z
+date: 2025-05-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activesupport
@@ -90,6 +90,8 @@ files:
 - example/check_ferrum.rb
 - example/check_lightpanda.rb
 - example/connect_test.rb
+- example/detail.rb
+- example/lightpanda_define.rb
 - example/lightpanda_links.rb
 - example/master_detail.rb
 - example/proxy.rb
@@ -104,6 +106,7 @@ files:
 - lib/spidy/binder/error.rb
 - lib/spidy/binder/html.rb
 - lib/spidy/binder/json.rb
+- lib/spidy/binder/lightpanda.rb
 - lib/spidy/binder/xml.rb
 - lib/spidy/command_line.rb
 - lib/spidy/connector.rb