spidy 1.0.1 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/example/detail.rb +77 -0
- data/example/lightpanda_define.rb +28 -0
- data/lib/spidy/binder/lightpanda.rb +26 -0
- data/lib/spidy/binder.rb +3 -0
- data/lib/spidy/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 044aaddab677c3b7d4e482c0c17d3f02de1bbc84d767cb1f78dfde5f4be0f9ee
|
4
|
+
data.tar.gz: d2f6a2bd0e346b977e30d9bbf785eb6374013736110c0a03b086bc225c71e80e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d7f5753032274c5446b2de13f6ca3487f501898d83056d39d3096522f592842b3ee7f77ed39382429bf444778f29d6394a83cee912430963062efad4eb354bc9
|
7
|
+
data.tar.gz: 8f144620039a59238866daa983d5790cf7c6e47df1f53a550736eba452a5f08f28941730f5e9e7991e537063c820bb1a9121a2e3335cd81ef33562bd3b20c52d
|
data/Gemfile.lock
CHANGED
data/example/detail.rb
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require_relative '../lib/spidy'
|
5
|
+
|
6
|
+
# Define a scraper that uses the Lightpanda connector for Instagram profile
|
7
|
+
scraper = Spidy.define do
|
8
|
+
# Set a user agent to mimic a desktop browser
|
9
|
+
user_agent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \
|
10
|
+
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
11
|
+
|
12
|
+
# Define a spider that uses the Lightpanda connector
|
13
|
+
spider(as: :lightpanda) do |yielder, connector, url|
|
14
|
+
puts "Processing Instagram profile: #{url}" if ENV['DEBUG']
|
15
|
+
connector.call(url) do |page|
|
16
|
+
yielder.call(page)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
# Define the object structure we want to extract
|
21
|
+
define(as: :lightpanda) do
|
22
|
+
let(:title) { html.title }
|
23
|
+
|
24
|
+
let(:profile_name) { html.at('meta[property="og:title"]')&.[]('content') }
|
25
|
+
|
26
|
+
let(:description) { html.at('meta[property="og:description"]')&.[]('content') }
|
27
|
+
|
28
|
+
let(:image) { html.at('meta[property="og:image"]')&.[]('content') }
|
29
|
+
|
30
|
+
let(:followers_count) do
|
31
|
+
# Try to extract followers count from various possible locations
|
32
|
+
text = html.css('span._ac2a, span._aacl, span[data-count]').detect do |el|
|
33
|
+
el.text =~ /followers/i
|
34
|
+
end&.text
|
35
|
+
|
36
|
+
if text
|
37
|
+
# Extract just the number
|
38
|
+
text.gsub(/[^\d,\.]/, '').gsub(/[,\.]/, '').to_i
|
39
|
+
else
|
40
|
+
nil
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
let(:posts) do
|
45
|
+
html.css('article img').map do |img|
|
46
|
+
{
|
47
|
+
src: img['src'],
|
48
|
+
alt: img['alt']
|
49
|
+
}
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
# Get the URL from stdin
|
56
|
+
url = STDIN.read.strip
|
57
|
+
|
58
|
+
begin
|
59
|
+
# Run the scraper
|
60
|
+
result = scraper.call(url)
|
61
|
+
|
62
|
+
# Output the results
|
63
|
+
puts "---- Instagram Profile: #{result.profile_name} ----"
|
64
|
+
puts "Description: #{result.description}"
|
65
|
+
puts "Followers: #{result.followers_count || 'Unable to extract'}"
|
66
|
+
puts "\nProfile Image: #{result.image}" if result.image
|
67
|
+
|
68
|
+
puts "\nRecent Posts: #{result.posts.size}"
|
69
|
+
result.posts.each_with_index do |post, i|
|
70
|
+
puts "#{i+1}. #{post[:alt] || 'No description'}"
|
71
|
+
puts " #{post[:src]}"
|
72
|
+
end
|
73
|
+
rescue StandardError => e
|
74
|
+
puts "Error processing profile: #{e.message}"
|
75
|
+
puts e.backtrace if ENV['DEBUG']
|
76
|
+
exit 1
|
77
|
+
end
|
data/example/lightpanda_define.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'spidy'
|
2
|
+
|
3
|
+
# Define a scraper using LightPanda (JavaScript-rendered pages)
|
4
|
+
definition = Spidy::Shell.run do
|
5
|
+
user_agent 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
|
6
|
+
|
7
|
+
# Define a scraper using LightPanda
|
8
|
+
define(as: :lightpanda) do
|
9
|
+
let(:title) { html.title }
|
10
|
+
let(:headings) { html.css('h1, h2, h3').map { it.text.strip } }
|
11
|
+
let(:links) { html.css('a').map { |a| { href: a['href'], text: a.text.strip } } }
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
# Execute the scraper on a JavaScript-heavy website
|
16
|
+
result = definition.call('https://www.example.com')
|
17
|
+
|
18
|
+
puts "Title: #{result.title}"
|
19
|
+
puts "\nHeadings:"
|
20
|
+
result.headings.each do |heading|
|
21
|
+
puts "- #{heading}"
|
22
|
+
end
|
23
|
+
|
24
|
+
puts "\nLinks:"
|
25
|
+
result.links.each do |link|
|
26
|
+
puts "- #{link[:text]} (#{link[:href]})"
|
27
|
+
end
|
28
|
+
|
data/lib/spidy/binder/lightpanda.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
#
|
2
|
+
# Bind LightPanda page and convert to object
|
3
|
+
#
|
4
|
+
module Spidy::Binder::Lightpanda
|
5
|
+
def let(name, query = nil, &block)
|
6
|
+
@attribute_names ||= []
|
7
|
+
@attribute_names << name
|
8
|
+
|
9
|
+
return define_method(name) { html.at(query)&.text&.strip } if block.nil?
|
10
|
+
|
11
|
+
define_method(name) do
|
12
|
+
if query.present?
|
13
|
+
instance_exec(html.at(query), &block)
|
14
|
+
else
|
15
|
+
instance_exec(&block)
|
16
|
+
end
|
17
|
+
rescue StandardError => e
|
18
|
+
raise Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
def self.extended(obj)
|
23
|
+
obj.alias_method :html, :resource
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
data/lib/spidy/binder.rb
CHANGED
data/lib/spidy/version.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- aileron
|
8
8
|
bindir: exe
|
9
9
|
cert_chain: []
|
10
|
-
date: 2025-
|
10
|
+
date: 2025-05-20 00:00:00.000000000 Z
|
11
11
|
dependencies:
|
12
12
|
- !ruby/object:Gem::Dependency
|
13
13
|
name: activesupport
|
@@ -90,6 +90,8 @@ files:
|
|
90
90
|
- example/check_ferrum.rb
|
91
91
|
- example/check_lightpanda.rb
|
92
92
|
- example/connect_test.rb
|
93
|
+
- example/detail.rb
|
94
|
+
- example/lightpanda_define.rb
|
93
95
|
- example/lightpanda_links.rb
|
94
96
|
- example/master_detail.rb
|
95
97
|
- example/proxy.rb
|
@@ -104,6 +106,7 @@ files:
|
|
104
106
|
- lib/spidy/binder/error.rb
|
105
107
|
- lib/spidy/binder/html.rb
|
106
108
|
- lib/spidy/binder/json.rb
|
109
|
+
- lib/spidy/binder/lightpanda.rb
|
107
110
|
- lib/spidy/binder/xml.rb
|
108
111
|
- lib/spidy/command_line.rb
|
109
112
|
- lib/spidy/connector.rb
|