spidy 1.0.1 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f1eebbdb7391e07252e47811959d8a126fe3566c50b2cd5a98d17d1ad4eaff14
4
- data.tar.gz: 2a3fea8707650684d72d4b6a26ac253acf413bba4cb9cc7eb89eb2aea4a17130
3
+ metadata.gz: 044aaddab677c3b7d4e482c0c17d3f02de1bbc84d767cb1f78dfde5f4be0f9ee
4
+ data.tar.gz: d2f6a2bd0e346b977e30d9bbf785eb6374013736110c0a03b086bc225c71e80e
5
5
  SHA512:
6
- metadata.gz: 4744da97bed048c7c832543e747c4152b391e407ff70bba62bc730f2b18ea5fb9d97a9552ab54010d9693eaed41bd695dd782e5fb6cf0578974705c0cdc46b14
7
- data.tar.gz: ea20552346def655974bd8e348e2320d12060b97e3fc0a44c410ae01a3db57a5d7f8fa0f2d7a3639d9b8d7c1128c05c59648ac09ca2e219bd1e20fff6948a1aa
6
+ metadata.gz: d7f5753032274c5446b2de13f6ca3487f501898d83056d39d3096522f592842b3ee7f77ed39382429bf444778f29d6394a83cee912430963062efad4eb354bc9
7
+ data.tar.gz: 8f144620039a59238866daa983d5790cf7c6e47df1f53a550736eba452a5f08f28941730f5e9e7991e537063c820bb1a9121a2e3335cd81ef33562bd3b20c52d
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- spidy (1.0.0)
4
+ spidy (1.1.1)
5
5
  activesupport (~> 7.1)
6
6
  mechanize
7
7
  socksify
data/example/detail.rb ADDED
@@ -0,0 +1,77 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require_relative '../lib/spidy'
5
+
6
+ # Define a scraper that uses the Lightpanda connector for Instagram profile
7
+ scraper = Spidy.define do
8
+ # Set a user agent to mimic a desktop browser
9
+ user_agent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \
10
+ 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
11
+
12
+ # Define a spider that uses the Lightpanda connector
13
+ spider(as: :lightpanda) do |yielder, connector, url|
14
+ puts "Processing Instagram profile: #{url}" if ENV['DEBUG']
15
+ connector.call(url) do |page|
16
+ yielder.call(page)
17
+ end
18
+ end
19
+
20
+ # Define the object structure we want to extract
21
+ define(as: :lightpanda) do
22
+ let(:title) { html.title }
23
+
24
+ let(:profile_name) { html.at('meta[property="og:title"]')&.[]('content') }
25
+
26
+ let(:description) { html.at('meta[property="og:description"]')&.[]('content') }
27
+
28
+ let(:image) { html.at('meta[property="og:image"]')&.[]('content') }
29
+
30
+ let(:followers_count) do
31
+ # Try to extract followers count from various possible locations
32
+ text = html.css('span._ac2a, span._aacl, span[data-count]').detect do |el|
33
+ el.text =~ /followers/i
34
+ end&.text
35
+
36
+ if text
37
+ # Extract just the number
38
+ text.gsub(/[^\d,\.]/, '').gsub(/[,\.]/, '').to_i
39
+ else
40
+ nil
41
+ end
42
+ end
43
+
44
+ let(:posts) do
45
+ html.css('article img').map do |img|
46
+ {
47
+ src: img['src'],
48
+ alt: img['alt']
49
+ }
50
+ end
51
+ end
52
+ end
53
+ end
54
+
55
+ # Get the URL from stdin
56
+ url = STDIN.read.strip
57
+
58
+ begin
59
+ # Run the scraper
60
+ result = scraper.call(url)
61
+
62
+ # Output the results
63
+ puts "---- Instagram Profile: #{result.profile_name} ----"
64
+ puts "Description: #{result.description}"
65
+ puts "Followers: #{result.followers_count || 'Unable to extract'}"
66
+ puts "\nProfile Image: #{result.image}" if result.image
67
+
68
+ puts "\nRecent Posts: #{result.posts.size}"
69
+ result.posts.each_with_index do |post, i|
70
+ puts "#{i+1}. #{post[:alt] || 'No description'}"
71
+ puts " #{post[:src]}"
72
+ end
73
+ rescue StandardError => e
74
+ puts "Error processing profile: #{e.message}"
75
+ puts e.backtrace if ENV['DEBUG']
76
+ exit 1
77
+ end
data/example/lightpanda_define.rb ADDED
@@ -0,0 +1,28 @@
1
+ require 'spidy'
2
+
3
+ # Define a scraper using LightPanda (JavaScript-rendered pages)
4
+ definition = Spidy::Shell.run do
5
+ user_agent 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
6
+
7
+ # Define a scraper using LightPanda
8
+ define(as: :lightpanda) do
9
+ let(:title) { html.title }
10
+ let(:headings) { html.css('h1, h2, h3').map { it.text.strip } }
11
+ let(:links) { html.css('a').map { |a| { href: a['href'], text: a.text.strip } } }
12
+ end
13
+ end
14
+
15
+ # Execute the scraper on a JavaScript-heavy website
16
+ result = definition.call('https://www.example.com')
17
+
18
+ puts "Title: #{result.title}"
19
+ puts "\nHeadings:"
20
+ result.headings.each do |heading|
21
+ puts "- #{heading}"
22
+ end
23
+
24
+ puts "\nLinks:"
25
+ result.links.each do |link|
26
+ puts "- #{link[:text]} (#{link[:href]})"
27
+ end
28
+
data/lib/spidy/binder/lightpanda.rb ADDED
@@ -0,0 +1,26 @@
1
+ #
2
+ # Bind LightPanda page and convert to object
3
+ #
4
+ module Spidy::Binder::Lightpanda
5
+ def let(name, query = nil, &block)
6
+ @attribute_names ||= []
7
+ @attribute_names << name
8
+
9
+ return define_method(name) { html.at(query)&.text&.strip } if block.nil?
10
+
11
+ define_method(name) do
12
+ if query.present?
13
+ instance_exec(html.at(query), &block)
14
+ else
15
+ instance_exec(&block)
16
+ end
17
+ rescue StandardError => e
18
+ raise Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
19
+ end
20
+ end
21
+
22
+ def self.extended(obj)
23
+ obj.alias_method :html, :resource
24
+ end
25
+ end
26
+
data/lib/spidy/binder.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  #
2
4
  # Bind resource received from the connection to the result object
3
5
  #
@@ -7,4 +9,5 @@ module Spidy::Binder
7
9
  autoload :Json
8
10
  autoload :Html
9
11
  autoload :Xml
12
+ autoload :Lightpanda
10
13
  end
data/lib/spidy/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Spidy
2
- VERSION = '1.0.1'.freeze
2
+ VERSION = '1.1.1'.freeze
3
3
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidy
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
8
  bindir: exe
9
9
  cert_chain: []
10
- date: 2025-04-16 00:00:00.000000000 Z
10
+ date: 2025-05-20 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: activesupport
@@ -90,6 +90,8 @@ files:
90
90
  - example/check_ferrum.rb
91
91
  - example/check_lightpanda.rb
92
92
  - example/connect_test.rb
93
+ - example/detail.rb
94
+ - example/lightpanda_define.rb
93
95
  - example/lightpanda_links.rb
94
96
  - example/master_detail.rb
95
97
  - example/proxy.rb
@@ -104,6 +106,7 @@ files:
104
106
  - lib/spidy/binder/error.rb
105
107
  - lib/spidy/binder/html.rb
106
108
  - lib/spidy/binder/json.rb
109
+ - lib/spidy/binder/lightpanda.rb
107
110
  - lib/spidy/binder/xml.rb
108
111
  - lib/spidy/command_line.rb
109
112
  - lib/spidy/connector.rb