instaview 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +21 -0
- data/LICENSE.txt +21 -0
- data/README.md +202 -0
- data/Rakefile +4 -0
- data/lib/instaview/version.rb +5 -0
- data/lib/instaview.rb +489 -0
- metadata +111 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 6ecbb088043452c55b479846568262489b6c217798de96ccc0acdc7a6dfcfffb
|
4
|
+
data.tar.gz: b3353cca7a6219a42e4d5c31de08a3136b6760ba8f7fd3fc732febcb1881dd04
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 4430729542295f37cd35935cbb5589edf796c0940d87153281305c249697b1eaa257f4dba81dd8c15250954d17b8962fa56113d8d2ad3609d712e0f266243f8b
|
7
|
+
data.tar.gz: e1977fc370f81854906d66d983e5a08a3bb49aeacb1ae4faa9a2d1c144584ab3299588377bd10e829649cb86026be3e17e01b4c18865ce138d05748f32774904
|
data/CHANGELOG.md
ADDED
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
Code of Conduct & Legal Disclaimer
|
2
|
+
|
3
|
+
This project is provided as-is for scientific and research purposes only.
|
4
|
+
|
5
|
+
Disclaimer of Liability
|
6
|
+
|
7
|
+
The author does not assume and hereby disclaims any responsibility or liability for any use, misuse, or consequences—direct or indirect—arising from this code.
|
8
|
+
|
9
|
+
You are solely responsible for anything you do with it, including modifying, distributing, or deploying it.
|
10
|
+
|
11
|
+
The author does not provide any warranty, express or implied, and does not guarantee fitness for a particular purpose, safety, legality, or correctness of the code in any jurisdiction.
|
12
|
+
|
13
|
+
Use at your own risk.
|
14
|
+
|
15
|
+
No Support, No Warranty
|
16
|
+
|
17
|
+
This is a personal project. There is no official support, no community, and no maintenance commitment.
|
18
|
+
|
19
|
+
No Restrictions on Use
|
20
|
+
|
21
|
+
You are free to use, modify, redistribute, or ignore this project as you see fit. The author does not care how you use it and takes no responsibility for what you do with it.
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2025 Nicolas Reiner
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,202 @@
|
|
1
|
+
# Instaview
|
2
|
+
|
3
|
+
A Ruby gem that uses Selenium WebDriver to scrape Instagram stories and media from third-party services like StoriesIG. This gem provides a programmatic interface to fetch Instagram content without using the official Instagram API.
|
4
|
+
|
5
|
+
## Features
|
6
|
+
|
7
|
+
- Scrapes Instagram stories and media using Selenium WebDriver
|
8
|
+
- Targets StoriesIG.info for anonymous Instagram viewing
|
9
|
+
- Extracts media items with images, captions, download links, and metadata
|
10
|
+
- Headless browser automation with fallback HTTP method
|
11
|
+
- JSON output format with structured data
|
12
|
+
- Command-line interface with multiple methods
|
13
|
+
- Built-in local JSON cache with 12h TTL and async fetching
|
14
|
+
|
15
|
+
## Installation
|
16
|
+
|
17
|
+
Add this line to your application's Gemfile:
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
gem 'instaview'
|
21
|
+
```
|
22
|
+
|
23
|
+
And then execute:
|
24
|
+
|
25
|
+
$ bundle install
|
26
|
+
|
27
|
+
Or install it yourself as:
|
28
|
+
|
29
|
+
$ gem install instaview
|
30
|
+
|
31
|
+
## Setup
|
32
|
+
|
33
|
+
After installation, you may need to install Chrome/Chromium for the Selenium method:
|
34
|
+
|
35
|
+
```bash
|
36
|
+
# Ubuntu/Debian
|
37
|
+
sudo apt-get install chromium-browser
|
38
|
+
|
39
|
+
# Or run the setup script
|
40
|
+
./bin/setup_selenium
|
41
|
+
```
|
42
|
+
|
43
|
+
## Usage
|
44
|
+
|
45
|
+
### Command Line
|
46
|
+
|
47
|
+
```bash
|
48
|
+
# Test the gem
|
49
|
+
ruby bin/instaview test
|
50
|
+
|
51
|
+
# Using the Selenium method (default)
|
52
|
+
ruby bin/instaview instagram
|
53
|
+
|
54
|
+
# Using the HTTP method (fallback/diagnostic)
|
55
|
+
ruby bin/instaview instagram http
|
56
|
+
|
57
|
+
# Or if installed as a gem
|
58
|
+
instaview username_here http
|
59
|
+
```
|
60
|
+
|
61
|
+
### Ruby Code
|
62
|
+
|
63
|
+
```ruby
|
64
|
+
require 'instaview'
|
65
|
+
|
66
|
+
# Test connectivity
|
67
|
+
Instaview.test_connectivity
|
68
|
+
|
69
|
+
# Cache-first with 12h TTL; falls back to fetch when stale/missing
|
70
|
+
result = Instaview.get_from_cache_or_async("instagram")
|
71
|
+
|
72
|
+
# Cache-only (no network); returns nil if stale/missing
|
73
|
+
cached = Instaview.load_from_cache_only("instagram")
|
74
|
+
|
75
|
+
# Start an async fetch and get the result later
|
76
|
+
t = Instaview.fetch_data_async("instagram", method: :selenium)
|
77
|
+
result = t.value # waits for completion
|
78
|
+
|
79
|
+
puts result
|
80
|
+
```
|
81
|
+
|
82
|
+
### Example Output
|
83
|
+
|
84
|
+
#### Selenium Method (Full Automation)
|
85
|
+
```json
|
86
|
+
{
|
87
|
+
"username": "instagram",
|
88
|
+
"method": "selenium_storiesig",
|
89
|
+
"page_state": "media_found",
|
90
|
+
"media_items_found": 3,
|
91
|
+
"media_items": [
|
92
|
+
{
|
93
|
+
"image_url": "https://media.storiesig.info/get?__sig=...",
|
94
|
+
"alt_text": "preview",
|
95
|
+
"caption": "Some caption text.",
|
96
|
+
"download_url": "https://media.storiesig.info/get?__sig=...",
|
97
|
+
"likes": "8",
|
98
|
+
"time": "2 weeks ago",
|
99
|
+
"time_title": "2025-09-13T08:20:20Z"
|
100
|
+
}
|
101
|
+
],
|
102
|
+
"all_images": [
|
103
|
+
"https://media.storiesig.info/get?__sig=...",
|
104
|
+
"https://cdn.example.com/asset.jpg"
|
105
|
+
],
|
106
|
+
"download_links": [
|
107
|
+
"https://media.storiesig.info/get?__sig=..."
|
108
|
+
],
|
109
|
+
"error_message": null,
|
110
|
+
"success": true,
|
111
|
+
"debug_info": {
|
112
|
+
"total_images": 25,
|
113
|
+
"total_links": 80
|
114
|
+
}
|
115
|
+
/* When served from cache, the top-level object may also include:
|
116
|
+
"cached": true
|
117
|
+
*/
|
118
|
+
}
|
119
|
+
```
|
120
|
+
|
121
|
+
#### HTTP Method (Page Analysis)
|
122
|
+
```json
|
123
|
+
{
|
124
|
+
"username": "instagram",
|
125
|
+
"method": "simple_http_curl",
|
126
|
+
"forms_found": 1,
|
127
|
+
"inputs_found": 1,
|
128
|
+
"sample_images": [
|
129
|
+
"https://cdn.example.com/image1.jpg",
|
130
|
+
"https://cdn.example.com/image2.jpg"
|
131
|
+
],
|
132
|
+
"message": "Simple HTTP method using curl - shows page structure. For full automation use selenium method."
|
133
|
+
}
|
134
|
+
```
|
135
|
+
|
136
|
+
## Caching
|
137
|
+
|
138
|
+
Instaview stores successful results as JSON cache files per username. By default:
|
139
|
+
|
140
|
+
- Cache directory: `~/.cache/instaview` (override with `INSTAVIEW_CACHE_DIR`)
|
141
|
+
- File name: `<username>.json`
|
142
|
+
- Default TTL: 12 hours
|
143
|
+
|
144
|
+
You will see `"cached": true` in the JSON when the result was served from cache.
|
145
|
+
|
146
|
+
## Dependencies
|
147
|
+
|
148
|
+
- Ruby >= 3.2.0
|
149
|
+
- selenium-webdriver (~> 4.0)
|
150
|
+
- httparty (~> 0.21)
|
151
|
+
- nokogiri (~> 1.15)
|
152
|
+
- json (~> 2.0)
|
153
|
+
- curl (system dependency)
|
154
|
+
|
155
|
+
## Development
|
156
|
+
|
157
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
158
|
+
|
159
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
160
|
+
|
161
|
+
## Troubleshooting
|
162
|
+
|
163
|
+
### Selenium Issues
|
164
|
+
|
165
|
+
If you encounter issues with Selenium WebDriver:
|
166
|
+
|
167
|
+
1. Make sure Chrome/Chromium is installed:
|
168
|
+
```bash
|
169
|
+
sudo apt-get install chromium-browser
|
170
|
+
# or
|
171
|
+
./bin/setup_selenium
|
172
|
+
```
|
173
|
+
|
174
|
+
2. Use the HTTP method as a fallback:
|
175
|
+
```bash
|
176
|
+
instaview username http
|
177
|
+
```
|
178
|
+
|
179
|
+
3. Check if the selectors need updating (websites change frequently)
|
180
|
+
|
181
|
+
### General Issues
|
182
|
+
|
183
|
+
- Some sites have anti-bot measures that may block automated requests
|
184
|
+
- The HTTP method provides basic page structure analysis
|
185
|
+
- For full automation, Selenium is needed but may face restrictions
|
186
|
+
- If you see Selenium logs on every run, the cache is probably empty or stale; run the same lookup twice within 12 hours to see a cached response.
|
187
|
+
|
188
|
+
## Legal Notice
|
189
|
+
|
190
|
+
This gem is for educational purposes only. Please respect Instagram's terms of service and the terms of service of any third-party websites you scrape. The authors are not responsible for any misuse of this software.
|
191
|
+
|
192
|
+
## Contributing
|
193
|
+
|
194
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/nicolasreiner/instaview. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/nicolasreiner/instaview/blob/master/CODE_OF_CONDUCT.md).
|
195
|
+
|
196
|
+
## License
|
197
|
+
|
198
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
199
|
+
|
200
|
+
## Code of Conduct
|
201
|
+
|
202
|
+
Everyone interacting in the Instaview project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/nicolasreiner/instaview/blob/master/CODE_OF_CONDUCT.md).
|
data/Rakefile
ADDED
data/lib/instaview.rb
ADDED
@@ -0,0 +1,489 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "instaview/version"
|
4
|
+
require 'selenium-webdriver'
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'json'
|
7
|
+
require 'net/http'
|
8
|
+
require 'uri'
|
9
|
+
require 'fileutils'
|
10
|
+
require 'time'
|
11
|
+
|
12
|
+
module Instaview
|
13
|
+
# Gem-specific error class declared in the Instaview namespace; a plain StandardError subclass.
class Error < StandardError; end
|
14
|
+
|
15
|
+
# @description
# Convenience entry point: validates the username, then delegates to
# `get_from_cache_or_async` with a 12 hour cache TTL.
# @Parameter
# username: String - Instagram username to retrieve data for (required)
# @Return values
# Whatever `get_from_cache_or_async` returns for the username
# @Errors
# ArgumentError - if `username` is nil or blank
def self.getData(username = nil)
  # nil.to_s is "", so one blank check covers both nil and whitespace-only input.
  blank = username.to_s.strip.empty?
  raise ArgumentError, "username is required" if blank

  get_from_cache_or_async(username, max_age_hours: 12)
end
|
29
|
+
|
30
|
+
# @description
# Start a background thread that scrapes data for the username and, when the scrape
# did not report failure, writes the result to the cache.
# @Parameter
# username: String - Instagram username (required)
# method: Symbol - :simple_http selects the curl backend; :selenium (default) and any
#         other value select the Selenium backend
# @Return values
# Thread - The started thread; `thread.value` returns the scrape result when finished
# @Errors
# ArgumentError - if `username` is nil or blank (raised in the calling thread)
# StandardError - scraping failures are logged with `warn` and re-raised inside the
#                 thread, so they surface from `thread.value` / `thread.join`
def self.fetch_data_async(username, method: :selenium)
  raise ArgumentError, "username is required" if username.nil? || username.to_s.strip.empty?

  Thread.new do
    begin
      result =
        case method
        when :simple_http
          scrape_with_simple_http(username)
        else
          # :selenium and any unrecognised value use the Selenium backend.
          scrape_instagram_stories(username)
        end

      # Persist only results that did not explicitly fail. Caching a `success: false`
      # result would serve that failure from cache until the TTL expires.
      # Results without a :success key (the simple HTTP report) are still cached.
      failed = result.is_a?(Hash) && result[:success] == false
      unless failed
        begin
          write_to_cache(username, result)
        rescue StandardError => e
          # Cache problems are best-effort: report them but still return the result.
          warn "Instaview: failed to write cache for #{username}: #{e.message}"
        end
      end

      result
    rescue StandardError => e
      warn "Instaview: async fetch failed for #{username}: #{e.message}"
      raise
    end
  end
end
|
68
|
+
|
69
|
+
# @description
# Return the cached result for the username when one exists within the TTL; otherwise
# start `fetch_data_async`, wait for it, and return the fresh result.
# @Parameter
# username: String - Instagram username (required)
# max_age_hours: Numeric - Cache TTL in hours, fractions allowed (default: 12)
# method: Symbol - scraping backend passed through to `fetch_data_async`
# @Return values
# The value from `read_from_cache` when present, else the value of the fetch thread
# @Errors
# ArgumentError - if `username` is nil or blank
# StandardError - whatever the fetch thread raised, re-raised by `Thread#value`
def self.get_from_cache_or_async(username, max_age_hours: 12, method: :selenium)
  # Validate before touching the cache: a blank username would otherwise be looked up
  # under the cache file name ".json".
  raise ArgumentError, "username is required" if username.nil? || username.to_s.strip.empty?

  # to_f keeps fractional TTLs (0.5 => 1800 seconds); to_i would truncate them to 0.
  max_age_seconds = max_age_hours.to_f * 3600
  cached = read_from_cache(username, max_age_seconds: max_age_seconds)
  if cached
    puts "Using cached data for #{username}"
    return cached
  end

  puts "No valid cache found for #{username}, fetching data..."
  fetch_data_async(username, method: method).value # join and return the result
end
|
90
|
+
|
91
|
+
# @description
# Look up the cached result for the username via `read_from_cache` without starting
# any fetch.
# @Parameter
# username: String - Instagram username (required)
# max_age_hours: Numeric - Cache TTL in hours, fractions allowed (default: 12)
# @Return values
# The value returned by `read_from_cache` for the username and TTL
# @Errors
# ArgumentError - if `username` is nil or blank
def self.load_from_cache_only(username, max_age_hours: 12)
  raise ArgumentError, "username is required" if username.nil? || username.to_s.strip.empty?

  # to_f keeps fractional TTLs (0.5 => 1800 seconds); to_i would truncate them to 0.
  read_from_cache(username, max_age_seconds: max_age_hours.to_f * 3600)
end
|
105
|
+
|
106
|
+
# --- Cache helpers ---
|
107
|
+
# @description
# Resolve the directory used to store cache files.
# @Parameter
# None
# @Return values
# String - The expanded INSTAVIEW_CACHE_DIR value when it is set and non-blank,
#          otherwise `.cache/instaview` under `Dir.home`
# @Errors
# ArgumentError - may be raised by `Dir.home` / `File.expand_path` when no home
#                 directory can be determined
def self.cache_dir
  override = ENV['INSTAVIEW_CACHE_DIR'].to_s.strip

  # A blank override is treated as unset; an empty directory name would make
  # File.join(cache_dir, "user.json") resolve to "/user.json".
  return File.expand_path(override) unless override.empty?

  File.join(Dir.home, ".cache", "instaview")
end
|
118
|
+
|
119
|
+
# @description
# Build the cache file path for a username. Every character other than ASCII letters,
# digits, `_`, `-` and `.` is replaced with `_` before the name is used.
# @Parameter
# username: String - Instagram username (converted with `to_s`)
# @Return values
# String - `<cache_dir>/<sanitized username>.json`
# @Errors
# None raised here; errors from `cache_dir` propagate
def self.cache_file_for(username)
  safe_name = username.to_s.gsub(/[^A-Za-z0-9_.-]/, '_')
  File.join(cache_dir, safe_name + ".json")
end
|
131
|
+
|
132
|
+
# @description
# Read the cached result for a username if its cache file exists and was modified
# within the last `max_age_seconds`.
# @Parameter
# username: String - Instagram username
# max_age_seconds: Numeric - Maximum cache age in seconds (default: 43_200 => 12h)
# @Return values
# Parsed JSON (keys symbolized); a Hash result gets `:cached => true` added.
# nil when the file is missing, older than the limit, or not valid JSON.
# @Errors
# JSON::ParserError and Errno::ENOENT are rescued internally and reported as nil;
# other file I/O errors propagate
def self.read_from_cache(username, max_age_seconds: 43_200)
  path = cache_file_for(username)
  return nil unless File.exist?(path)

  age = Time.now - File.mtime(path)
  return nil if age > max_age_seconds

  data = JSON.parse(File.read(path), symbolize_names: true)
  # Annotate so callers can tell the result came from cache.
  data[:cached] = true if data.is_a?(Hash)
  data
rescue JSON::ParserError, Errno::ENOENT
  # ENOENT: the file disappeared between the existence check and the read, which is
  # the same outcome as "missing".
  nil
end
|
158
|
+
|
159
|
+
# @description
# Serialize `data` as pretty-printed JSON into the cache file for the username,
# creating the cache directory when needed.
# @Parameter
# username: String - Instagram username
# data: Hash - Data to persist as JSON
# @Return values
# true - On success
# @Errors
# StandardError - on underlying file I/O or JSON generation errors
def self.write_to_cache(username, data)
  FileUtils.mkdir_p(cache_dir)
  target = cache_file_for(username)

  # Write to a per-process/per-thread staging file and rename it into place, so a
  # concurrent reader never sees a half-written JSON document.
  staging = "#{target}.#{Process.pid}.#{Thread.current.object_id}.tmp"
  begin
    File.write(staging, JSON.pretty_generate(data))
    File.rename(staging, target)
  ensure
    # Only still present when the write or rename failed.
    File.delete(staging) if File.exist?(staging)
  end

  true
end
|
173
|
+
|
174
|
+
# @description
# Drive headless Chrome through Selenium WebDriver: open https://storiesig.info/,
# submit the username in the search form, then parse the resulting page with Nokogiri
# and collect the media list items, image URLs and download links.
# @Parameter
# username: String - Instagram username; when nil, the first command-line argument
#           (ARGV[0]) is used instead
# @Return values
# Hash - :username, :method, :page_state, :media_items_found, :media_items,
#        :all_images (first 10 http URLs), :download_links, :error_message, :success
#        and :debug_info; the Hash is also printed as JSON
# @Errors
# ArgumentError - if no username is given and ARGV[0] is missing or blank
# RuntimeError - if the search input does not appear within 10 seconds
# StandardError - other Selenium/WebDriver failures are printed and re-raised
def self.scrape_instagram_stories(username = nil)
  require 'tmpdir' # Dir.mktmpdir for the per-run Chrome profile
  require 'socket' # TCPServer for picking a free DevTools port

  target_username = username || ARGV[0] # fall back to the first CLI argument
  # Fail fast instead of launching a browser only to type nothing into the form.
  raise ArgumentError, "username is required" if target_username.nil? || target_username.to_s.strip.empty?

  driver = nil
  profile_dir = nil

  begin
    # Set up Selenium WebDriver with headless Chrome.
    options = Selenium::WebDriver::Chrome::Options.new
    [
      '--headless=new',
      '--no-sandbox',
      '--disable-dev-shm-usage',
      '--disable-gpu',
      '--disable-extensions',
      '--disable-background-timer-throttling',
      '--disable-backgrounding-occluded-windows',
      '--disable-renderer-backgrounding',
      '--window-size=1920,1080'
    ].each { |flag| options.add_argument(flag) }

    # Use a free DevTools port and a private profile directory per run. A fixed port
    # and a shared profile directory make two concurrent fetches collide.
    port_probe = TCPServer.new('127.0.0.1', 0)
    debug_port = port_probe.addr[1]
    port_probe.close
    profile_dir = Dir.mktmpdir('instaview-chrome-')
    options.add_argument("--remote-debugging-port=#{debug_port}")
    options.add_argument("--user-data-dir=#{profile_dir}")

    # Use the first Chrome/Chromium binary that exists; otherwise keep Selenium's default.
    chrome_binary = [
      "/snap/bin/chromium",
      "/usr/bin/chromium",
      "/usr/bin/chromium-browser",
      "/usr/bin/google-chrome"
    ].find { |path| File.exist?(path) }

    if chrome_binary
      options.binary = chrome_binary
      puts "Using Chrome binary: #{chrome_binary}"
    end

    driver = Selenium::WebDriver.for :chrome, options: options

    # 1) Go to the StoriesIG homepage.
    driver.navigate.to "https://storiesig.info/"
    sleep 2

    # 2) Wait for the search input to be present and visible.
    puts "Looking for search input..."
    wait = Selenium::WebDriver::Wait.new(timeout: 10)

    begin
      input_element = wait.until do
        element = driver.find_element(:css, 'input.search.search-form__input[placeholder*="username"]')
        element if element.displayed?
      end
    rescue Selenium::WebDriver::Error::TimeoutError
      raise "Search input not found with selector: input.search.search-form__input"
    end

    puts "Found search input, entering username: #{target_username}"
    input_element.clear
    input_element.send_keys(target_username)

    # 3) Click the search button, or press Enter when the button is absent.
    puts "Looking for search button..."
    begin
      button_element = driver.find_element(:css, 'button.search-form__button')
      puts "Found search button, clicking..."
      button_element.click
    rescue Selenium::WebDriver::Error::NoSuchElementError
      puts "Search button not found, trying Enter key..."
      input_element.send_keys(:return)
    end

    # 4) Wait for results and classify the page state.
    puts "Waiting for results to load..."
    sleep 3

    page_state = "unknown"
    error_message = nil

    media_items = driver.find_elements(:css, 'li.profile-media-list__item')
    if media_items.length > 0
      page_state = "media_found"
      puts "Found #{media_items.length} media items!"
    else
      sleep 2 # Give the page more time before concluding there is no media.
      media_items = driver.find_elements(:css, 'li.profile-media-list__item')

      if media_items.length > 0
        page_state = "media_found_delayed"
        puts "Found #{media_items.length} media items after delay!"
      else
        # Look for common error indicators. jQuery-style `:contains(...)` selectors
        # are left out: they are not valid CSS, so they could never match.
        error_selectors = [
          '.error', '.alert', '.warning',
          '[class*="error"]', '[class*="not-found"]'
        ]

        error_found = false
        error_selectors.each do |selector|
          begin
            error_elements = driver.find_elements(:css, selector)
            if error_elements.any?
              error_message = error_elements.first.text
              error_found = true
              break
            end
          rescue StandardError
            # Best effort: a failing probe just moves on to the next selector.
          end
        end

        if error_found
          page_state = "error_found"
          puts "Error found: #{error_message}"
        else
          page_state = "no_media"
          puts "No media items found, checking page content..."
        end
      end
    end

    # 5) Parse the rendered page and extract one Hash per media list item.
    doc = Nokogiri::HTML(driver.page_source)

    extracted_media = doc.css('li.profile-media-list__item').map do |item|
      media_data = {}

      img_element = item.css('.media-content__image').first
      if img_element
        media_data[:image_url] = img_element['src']
        media_data[:alt_text] = img_element['alt']
      end

      caption_element = item.css('.media-content__caption').first
      media_data[:caption] = caption_element&.text&.strip

      download_element = item.css('a.button.button--filled.button__download').first
      media_data[:download_url] = download_element['href'] if download_element

      like_element = item.css('.media-content__meta-like').first
      media_data[:likes] = like_element&.text&.strip

      time_element = item.css('.media-content__meta-time').first
      media_data[:time] = time_element&.text&.strip
      media_data[:time_title] = time_element['title'] if time_element

      # :caption, :likes and :time are always set (possibly nil), so every list item
      # yields an entry.
      media_data
    end

    # Also collect every image and link on the page.
    all_images = doc.css('img').map { |img| img['src'] }.compact.uniq.reject(&:empty?)
    all_links = doc.css('a').map { |link| link['href'] }.compact.uniq.reject(&:empty?)
    download_links = doc.css('a.button__download').map { |link| link['href'] }.compact.uniq

    result = {
      username: target_username,
      method: "selenium_storiesig",
      page_state: page_state,
      media_items_found: extracted_media.length,
      media_items: extracted_media,
      all_images: all_images.select { |img| img.start_with?('http') }.first(10), # Limit output
      download_links: download_links,
      error_message: error_message,
      success: !extracted_media.empty?,
      debug_info: {
        total_images: all_images.length,
        total_links: all_links.length
      }
    }

    # Save a screenshot for debugging when INSTAVIEW_DEBUG is set.
    if ENV['INSTAVIEW_DEBUG']
      screenshot_path = "/tmp/instaview_debug_#{Time.now.to_i}.png"
      driver.save_screenshot(screenshot_path)
      result[:debug_info][:screenshot_path] = screenshot_path
      puts "Debug screenshot saved to: #{screenshot_path}"
    end

    puts JSON.pretty_generate(result)

    result
  rescue StandardError => e
    puts "Error: #{e.message}"
    puts "Make sure Chrome/Chromium is installed for Selenium WebDriver"
    raise
  ensure
    begin
      driver&.quit
    ensure
      # Remove the throwaway Chrome profile even when quitting the driver fails.
      FileUtils.rm_rf(profile_dir) if profile_dir
    end
  end
end
|
382
|
+
|
383
|
+
# @description
# Fetch the StoriesIG homepage with curl and report basic page signals (number of
# forms and text/user inputs, up to three absolute image URLs). The username is only
# echoed in the report; it is not sent to the site.
# @Parameter
# username: String - Instagram username (required)
# @Return values
# Hash - :username, :method, :forms_found, :inputs_found, :sample_images, :message;
#        the Hash is also printed as JSON
# @Errors
# ArgumentError - if `username` is nil or blank
# RuntimeError - if the curl command fails or returns empty content
# StandardError - other failures (for example curl not being installed) are printed
#                 and re-raised
def self.scrape_with_simple_http(username = nil)
  target_username = username
  # `raise`, not `throw`: throw is for catch/throw control flow and would surface as
  # UncaughtThrowError with the message "uncaught throw ArgumentError".
  if target_username.nil? || target_username.to_s.strip.empty?
    raise ArgumentError, "Username is required for simple HTTP method"
  end

  begin
    puts "Trying to fetch page with curl..."

    curl_command = "curl -s -L -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 'https://storiesig.info/'"

    html_content = `#{curl_command}`

    unless $?.success? && !html_content.empty?
      raise "Curl command failed or returned empty content"
    end

    doc = Nokogiri::HTML(html_content)

    # Basic page structure signals.
    forms = doc.css('form')
    inputs = doc.css('input[type="text"], input[name*="user"]')

    # Absolute image URLs found on the page.
    images = doc.css('img').map { |img| img['src'] }.compact.select { |src| src.start_with?('http') }

    result = {
      username: target_username,
      method: "simple_http_curl",
      forms_found: forms.length,
      inputs_found: inputs.length,
      sample_images: images.first(3),
      message: "Simple HTTP method using curl - shows page structure. For full automation use selenium method."
    }

    puts JSON.pretty_generate(result)
    result
  rescue StandardError => e
    puts "Error with simple HTTP method: #{e.message}"
    puts "Try using scrape_instagram_stories method instead"
    raise
  end
end
|
435
|
+
|
436
|
+
# @description
# Smoke test: build a static report with the gem name, `Instaview::VERSION`, the list
# of public method names and an "OK" status, then print it as JSON. No network access
# is performed.
# @Parameter
# None
# @Return values
# Hash - :gem_name, :version, :methods_available, :status
# @Errors
# None
def self.test_connectivity
  puts "Testing Instaview gem connectivity..."

  method_names = [
    "scrape_instagram_stories",
    "scrape_with_simple_http",
    "fetch_data_async",
    "get_from_cache_or_async",
    "load_from_cache_only",
    "getData",
    "test_connectivity"
  ]

  report = {}
  report[:gem_name] = "Instaview"
  report[:version] = Instaview::VERSION
  report[:methods_available] = method_names
  report[:status] = "OK"

  puts JSON.pretty_generate(report)
  report
end
|
466
|
+
|
467
|
+
# @description
# Legacy parsing example: download https://www.instaview.me/ and print the text of
# every element carrying the `profile-media-list__item` class.
# NOTE(review): that class name is the one used for StoriesIG markup elsewhere in this
# file; whether instaview.me uses it too is not confirmed.
# @Parameter
# None
# @Return values
# nil
# @Errors
# StandardError - on network/read errors
def self.parseData
  # Loaded lazily so that open-uri is only pulled in when this legacy path is used.
  require "nokogiri"
  require "open-uri"

  url = "https://www.instaview.me/"
  # Block form reads the body and closes the handle.
  html = URI.open(url, &:read)
  doc = Nokogiri::HTML(html)

  # Select by class. The XPath `//profile-media-list__item` tested for an element
  # *named* profile-media-list__item, which no HTML page contains.
  doc.css(".profile-media-list__item").each do |item|
    puts item.text
  end

  nil
end
|
489
|
+
end
|
metadata
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: instaview
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Nicolas Reiner
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2025-10-01 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: httparty
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0.21'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0.21'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.15'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.15'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: json
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '2.0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '2.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: selenium-webdriver
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '4.0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '4.0'
|
69
|
+
description: A Ruby gem that uses Selenium to scrape Instagram stories and media from
|
70
|
+
third-party services
|
71
|
+
email:
|
72
|
+
- nici.ferd@gmail.com
|
73
|
+
executables: []
|
74
|
+
extensions: []
|
75
|
+
extra_rdoc_files: []
|
76
|
+
files:
|
77
|
+
- CHANGELOG.md
|
78
|
+
- CODE_OF_CONDUCT.md
|
79
|
+
- LICENSE.txt
|
80
|
+
- README.md
|
81
|
+
- Rakefile
|
82
|
+
- lib/instaview.rb
|
83
|
+
- lib/instaview/version.rb
|
84
|
+
homepage: https://github.com/nicolasreiner/instaview
|
85
|
+
licenses:
|
86
|
+
- MIT
|
87
|
+
metadata:
|
88
|
+
allowed_push_host: https://rubygems.org
|
89
|
+
homepage_uri: https://github.com/nicolasreiner/instaview
|
90
|
+
source_code_uri: https://github.com/nicolasreiner/instaview
|
91
|
+
changelog_uri: https://github.com/nicolasreiner/instaview/blob/main/CHANGELOG.md
|
92
|
+
post_install_message:
|
93
|
+
rdoc_options: []
|
94
|
+
require_paths:
|
95
|
+
- lib
|
96
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - ">="
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: 3.2.0
|
101
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
102
|
+
requirements:
|
103
|
+
- - ">="
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: '0'
|
106
|
+
requirements: []
|
107
|
+
rubygems_version: 3.4.20
|
108
|
+
signing_key:
|
109
|
+
specification_version: 4
|
110
|
+
summary: Instagram viewer gem using Selenium for scraping Instagram stories and media
|
111
|
+
test_files: []
|