instaview 0.1.0 → 0.2.0
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/instaview/version.rb +1 -1
- data/lib/instaview.rb +100 -117
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8a2e76b7660b0fc4f9b682c5e7ff1d1385cef855773cd6943d622e50e7ce79da
|
4
|
+
data.tar.gz: 375a1a3a9e60601d4260e1191b6941a23300de096caadb8096c13afe059ab99d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 91a033d9f63e75c33b7d64d3442a4051f5257e5bd6ab81967959881c54d4d574d352d6d8cb028078452b83ae67b3d23bb9fa53e967f731f650e054b56336c44a
|
7
|
+
data.tar.gz: f055ac7ccef5ed4f6da9d3547bd88dbd2a879c0aad68e457413c105d39c63085119a62af27bbcfe0fbd23c346f5a3036556244cc7ff8107087a212f9f254f005
|
data/lib/instaview/version.rb
CHANGED
data/lib/instaview.rb
CHANGED
@@ -8,6 +8,7 @@ require 'net/http'
|
|
8
8
|
require 'uri'
|
9
9
|
require 'fileutils'
|
10
10
|
require 'time'
|
11
|
+
require 'thread'
|
11
12
|
|
12
13
|
module Instaview
|
13
14
|
class Error < StandardError; end
|
@@ -41,28 +42,25 @@ module Instaview
|
|
41
42
|
raise ArgumentError, "username is required" if username.nil? || username.to_s.strip.empty?
|
42
43
|
|
43
44
|
Thread.new do
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
45
|
+
result = case method
|
46
|
+
when :selenium
|
47
|
+
scrape_instagram_stories(username)
|
48
|
+
when :simple_http
|
49
|
+
scrape_with_simple_http(username)
|
50
|
+
else
|
51
|
+
scrape_instagram_stories(username)
|
52
|
+
end
|
53
|
+
|
54
|
+
# Persist to cache on success
|
55
|
+
if data_found?(result)
|
55
56
|
begin
|
56
57
|
write_to_cache(username, result)
|
57
|
-
rescue
|
58
|
-
|
58
|
+
rescue StandardError
|
59
|
+
# Ignore cache write failures to avoid affecting callers
|
59
60
|
end
|
60
|
-
|
61
|
-
result
|
62
|
-
rescue => e
|
63
|
-
warn "Instaview: async fetch failed for #{username}: #{e.message}"
|
64
|
-
raise
|
65
61
|
end
|
62
|
+
|
63
|
+
result
|
66
64
|
end
|
67
65
|
end
|
68
66
|
|
@@ -79,11 +77,8 @@ module Instaview
|
|
79
77
|
def self.get_from_cache_or_async(username, max_age_hours: 12, method: :selenium)
|
80
78
|
max_age_seconds = (max_age_hours.to_i * 3600)
|
81
79
|
cached = read_from_cache(username, max_age_seconds: max_age_seconds)
|
82
|
-
if cached
|
83
|
-
|
84
|
-
return cached
|
85
|
-
end
|
86
|
-
puts "No valid cache found for #{username}, fetching data..."
|
80
|
+
return cached if cached
|
81
|
+
|
87
82
|
t = fetch_data_async(username, method: method)
|
88
83
|
t.value # join and return result
|
89
84
|
end
|
@@ -147,6 +142,7 @@ module Instaview
|
|
147
142
|
|
148
143
|
content = File.read(path)
|
149
144
|
data = JSON.parse(content, symbolize_names: true)
|
145
|
+
return nil unless data_found?(data)
|
150
146
|
# annotate so callers can tell it came from cache
|
151
147
|
if data.is_a?(Hash)
|
152
148
|
data[:cached] = true
|
@@ -171,6 +167,29 @@ module Instaview
|
|
171
167
|
true
|
172
168
|
end
|
173
169
|
|
170
|
+
def self.data_found?(data)
|
171
|
+
return false unless data.is_a?(Hash)
|
172
|
+
|
173
|
+
success = data[:success]
|
174
|
+
return true if success == true
|
175
|
+
|
176
|
+
method = data[:method]
|
177
|
+
|
178
|
+
if method == "selenium_storiesig"
|
179
|
+
media_found = data[:media_items_found].to_i > 0
|
180
|
+
media_present = data[:media_items].is_a?(Array) && !data[:media_items].empty?
|
181
|
+
return true if media_found || media_present
|
182
|
+
elsif method == "simple_http_curl"
|
183
|
+
forms_found = data[:forms_found].to_i > 0
|
184
|
+
inputs_found = data[:inputs_found].to_i > 0
|
185
|
+
samples_present = data[:sample_images].is_a?(Array) && !data[:sample_images].empty?
|
186
|
+
return true if forms_found || inputs_found || samples_present
|
187
|
+
end
|
188
|
+
|
189
|
+
false
|
190
|
+
end
|
191
|
+
private_class_method :data_found?
|
192
|
+
|
174
193
|
# @description
|
175
194
|
# Use Selenium WebDriver to automate StoriesIG and extract media details for a username.
|
176
195
|
# @Parameter
|
@@ -178,10 +197,11 @@ module Instaview
|
|
178
197
|
# @Return values
|
179
198
|
# Hash - Structured result including extracted media and metadata
|
180
199
|
# @Errors
|
181
|
-
#
|
200
|
+
# Instaview::Error - on Selenium/WebDriver failures or selector timeouts
|
182
201
|
def self.scrape_instagram_stories(username = nil)
|
183
202
|
target_username = username || ARGV[0] # pass username as argument
|
184
203
|
|
204
|
+
driver = nil
|
185
205
|
begin
|
186
206
|
# Setup Selenium WebDriver with headless Chrome
|
187
207
|
options = Selenium::WebDriver::Chrome::Options.new
|
@@ -204,14 +224,10 @@ module Instaview
|
|
204
224
|
"/usr/bin/chromium-browser",
|
205
225
|
"/usr/bin/google-chrome"
|
206
226
|
]
|
207
|
-
|
227
|
+
|
208
228
|
chrome_binary = chrome_paths.find { |path| File.exist?(path) }
|
209
|
-
|
210
|
-
|
211
|
-
options.binary = chrome_binary
|
212
|
-
puts "Using Chrome binary: #{chrome_binary}"
|
213
|
-
end
|
214
|
-
|
229
|
+
options.binary = chrome_binary if chrome_binary
|
230
|
+
|
215
231
|
driver = Selenium::WebDriver.for :chrome, options: options
|
216
232
|
|
217
233
|
# 1) Go to StoriesIG homepage
|
@@ -219,65 +235,54 @@ module Instaview
|
|
219
235
|
sleep 2
|
220
236
|
|
221
237
|
# 2) Find the specific search input for StoriesIG
|
222
|
-
puts "Looking for search input..."
|
223
|
-
input_element = nil
|
224
|
-
|
225
|
-
# Wait for page to load and find the specific input
|
226
238
|
wait = Selenium::WebDriver::Wait.new(timeout: 10)
|
227
|
-
|
228
|
-
begin
|
229
|
-
|
239
|
+
|
240
|
+
input_element = begin
|
241
|
+
wait.until do
|
230
242
|
element = driver.find_element(:css, 'input.search.search-form__input[placeholder*="username"]')
|
231
243
|
element if element.displayed?
|
232
244
|
end
|
233
245
|
rescue Selenium::WebDriver::Error::TimeoutError
|
234
|
-
raise "Search input not found with selector: input.search.search-form__input"
|
246
|
+
raise Instaview::Error, "Search input not found with selector: input.search.search-form__input"
|
235
247
|
end
|
236
248
|
|
237
|
-
puts "Found search input, entering username: #{target_username}"
|
238
249
|
input_element.clear
|
239
250
|
input_element.send_keys(target_username)
|
240
251
|
|
241
252
|
# 3) Click the specific search button
|
242
|
-
puts "Looking for search button..."
|
243
253
|
begin
|
244
254
|
button_element = driver.find_element(:css, 'button.search-form__button')
|
245
|
-
puts "Found search button, clicking..."
|
246
255
|
button_element.click
|
247
256
|
rescue Selenium::WebDriver::Error::NoSuchElementError
|
248
|
-
puts "Search button not found, trying Enter key..."
|
249
257
|
input_element.send_keys(:return)
|
250
258
|
end
|
251
259
|
|
252
260
|
# 4) Wait for results to load and check different possible outcomes
|
253
|
-
puts "Waiting for results to load..."
|
254
261
|
sleep 3
|
255
|
-
|
262
|
+
|
256
263
|
# Check for various possible page states
|
257
264
|
page_state = "unknown"
|
258
265
|
error_message = nil
|
259
|
-
|
266
|
+
|
260
267
|
# Check if media items loaded
|
261
268
|
media_items = driver.find_elements(:css, 'li.profile-media-list__item')
|
262
269
|
if media_items.length > 0
|
263
270
|
page_state = "media_found"
|
264
|
-
puts "Found #{media_items.length} media items!"
|
265
271
|
else
|
266
272
|
# Check for error messages or other states
|
267
273
|
sleep 2 # Give it more time
|
268
274
|
media_items = driver.find_elements(:css, 'li.profile-media-list__item')
|
269
|
-
|
275
|
+
|
270
276
|
if media_items.length > 0
|
271
277
|
page_state = "media_found_delayed"
|
272
|
-
puts "Found #{media_items.length} media items after delay!"
|
273
278
|
else
|
274
279
|
# Look for common error indicators
|
275
280
|
error_selectors = [
|
276
|
-
'.error', '.alert', '.warning',
|
281
|
+
'.error', '.alert', '.warning',
|
277
282
|
'[class*="error"]', '[class*="not-found"]',
|
278
283
|
'p:contains("not found")', 'div:contains("error")'
|
279
284
|
]
|
280
|
-
|
285
|
+
|
281
286
|
error_found = false
|
282
287
|
error_selectors.each do |selector|
|
283
288
|
begin
|
@@ -287,18 +292,12 @@ module Instaview
|
|
287
292
|
error_found = true
|
288
293
|
break
|
289
294
|
end
|
290
|
-
rescue
|
295
|
+
rescue StandardError
|
291
296
|
# Continue checking other selectors
|
292
297
|
end
|
293
298
|
end
|
294
|
-
|
295
|
-
|
296
|
-
page_state = "error_found"
|
297
|
-
puts "Error found: #{error_message}"
|
298
|
-
else
|
299
|
-
page_state = "no_media"
|
300
|
-
puts "No media items found, checking page content..."
|
301
|
-
end
|
299
|
+
|
300
|
+
page_state = error_found ? "error_found" : "no_media"
|
302
301
|
end
|
303
302
|
end
|
304
303
|
|
@@ -308,34 +307,34 @@ module Instaview
|
|
308
307
|
|
309
308
|
# Extract specific media items using the provided selector
|
310
309
|
media_list_items = doc.css('li.profile-media-list__item')
|
311
|
-
|
310
|
+
|
312
311
|
extracted_media = []
|
313
|
-
media_list_items.
|
312
|
+
media_list_items.each do |item|
|
314
313
|
media_data = {}
|
315
|
-
|
314
|
+
|
316
315
|
# Extract image source
|
317
316
|
img_element = item.css('.media-content__image').first
|
318
317
|
if img_element
|
319
318
|
media_data[:image_url] = img_element['src']
|
320
319
|
media_data[:alt_text] = img_element['alt']
|
321
320
|
end
|
322
|
-
|
321
|
+
|
323
322
|
# Extract caption
|
324
323
|
caption_element = item.css('.media-content__caption').first
|
325
324
|
media_data[:caption] = caption_element&.text&.strip
|
326
|
-
|
325
|
+
|
327
326
|
# Extract download link
|
328
327
|
download_element = item.css('a.button.button--filled.button__download').first
|
329
328
|
media_data[:download_url] = download_element['href'] if download_element
|
330
|
-
|
329
|
+
|
331
330
|
# Extract metadata
|
332
331
|
like_element = item.css('.media-content__meta-like').first
|
333
332
|
media_data[:likes] = like_element&.text&.strip
|
334
|
-
|
333
|
+
|
335
334
|
time_element = item.css('.media-content__meta-time').first
|
336
335
|
media_data[:time] = time_element&.text&.strip
|
337
336
|
media_data[:time_title] = time_element['title'] if time_element
|
338
|
-
|
337
|
+
|
339
338
|
extracted_media << media_data unless media_data.empty?
|
340
339
|
end
|
341
340
|
|
@@ -365,16 +364,13 @@ module Instaview
|
|
365
364
|
screenshot_path = "/tmp/instaview_debug_#{Time.now.to_i}.png"
|
366
365
|
driver.save_screenshot(screenshot_path)
|
367
366
|
result[:debug_info][:screenshot_path] = screenshot_path
|
368
|
-
puts "Debug screenshot saved to: #{screenshot_path}"
|
369
367
|
end
|
370
368
|
|
371
|
-
puts JSON.pretty_generate(result)
|
372
|
-
|
373
369
|
result
|
370
|
+
rescue Instaview::Error
|
371
|
+
raise
|
374
372
|
rescue => e
|
375
|
-
|
376
|
-
puts "Make sure Chrome/Chromium is installed for Selenium WebDriver"
|
377
|
-
raise e
|
373
|
+
raise Instaview::Error, "Selenium scraping failed: #{e.message}"
|
378
374
|
ensure
|
379
375
|
driver&.quit
|
380
376
|
end
|
@@ -388,48 +384,42 @@ module Instaview
|
|
388
384
|
# Hash - Basic page analysis and sample assets; primarily for diagnostics
|
389
385
|
# @Errors
|
390
386
|
# ArgumentError - if `username` is nil or empty
|
391
|
-
#
|
387
|
+
# Instaview::Error - if the curl command fails or other errors occur
|
392
388
|
def self.scrape_with_simple_http(username = nil)
|
393
389
|
target_username = username
|
394
|
-
|
390
|
+
raise ArgumentError, "Username is required for simple HTTP method" if target_username.nil? || target_username.empty?
|
391
|
+
|
395
392
|
begin
|
396
393
|
# Simple HTTP approach using curl
|
397
|
-
puts "Trying to fetch page with curl..."
|
398
|
-
|
399
394
|
curl_command = "curl -s -L -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 'https://storiesig.info/'"
|
400
|
-
|
395
|
+
|
401
396
|
html_content = `#{curl_command}`
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
# Extract basic page information
|
407
|
-
title = doc.css('title').text
|
408
|
-
forms = doc.css('form')
|
409
|
-
inputs = doc.css('input[type="text"], input[name*="user"]')
|
410
|
-
|
411
|
-
# Look for any existing media or links
|
412
|
-
images = doc.css('img').map { |img| img['src'] }.compact.select { |src| src.start_with?('http') }
|
413
|
-
links = doc.css('a').map { |link| link['href'] }.compact.select { |href| href.include?('instagram') || href.include?('media') }
|
414
|
-
|
415
|
-
result = {
|
416
|
-
username: target_username,
|
417
|
-
method: "simple_http_curl",
|
418
|
-
forms_found: forms.length,
|
419
|
-
inputs_found: inputs.length,
|
420
|
-
sample_images: images.first(3),
|
421
|
-
message: "Simple HTTP method using curl - shows page structure. For full automation use selenium method."
|
422
|
-
}
|
423
|
-
|
424
|
-
puts JSON.pretty_generate(result)
|
425
|
-
result
|
426
|
-
else
|
427
|
-
raise "Curl command failed or returned empty content"
|
397
|
+
|
398
|
+
unless $?.success? && !html_content.empty?
|
399
|
+
raise Instaview::Error, "Curl command failed or returned empty content"
|
428
400
|
end
|
401
|
+
|
402
|
+
doc = Nokogiri::HTML(html_content)
|
403
|
+
|
404
|
+
# Extract basic page information
|
405
|
+
forms = doc.css('form')
|
406
|
+
inputs = doc.css('input[type="text"], input[name*="user"]')
|
407
|
+
|
408
|
+
# Look for any existing media or links
|
409
|
+
images = doc.css('img').map { |img| img['src'] }.compact.select { |src| src.start_with?('http') }
|
410
|
+
|
411
|
+
{
|
412
|
+
username: target_username,
|
413
|
+
method: "simple_http_curl",
|
414
|
+
forms_found: forms.length,
|
415
|
+
inputs_found: inputs.length,
|
416
|
+
sample_images: images.first(3),
|
417
|
+
message: "Simple HTTP method using curl - shows page structure. For full automation use selenium method."
|
418
|
+
}
|
419
|
+
rescue Instaview::Error
|
420
|
+
raise
|
429
421
|
rescue => e
|
430
|
-
|
431
|
-
puts "Try using scrape_instagram_stories method instead"
|
432
|
-
raise e
|
422
|
+
raise Instaview::Error, "HTTP scraping failed: #{e.message}"
|
433
423
|
end
|
434
424
|
end
|
435
425
|
|
@@ -443,9 +433,7 @@ module Instaview
|
|
443
433
|
# None
|
444
434
|
def self.test_connectivity
|
445
435
|
# Simple test method to verify the gem works
|
446
|
-
|
447
|
-
|
448
|
-
result = {
|
436
|
+
{
|
449
437
|
gem_name: "Instaview",
|
450
438
|
version: Instaview::VERSION,
|
451
439
|
methods_available: [
|
@@ -459,9 +447,6 @@ module Instaview
|
|
459
447
|
],
|
460
448
|
status: "OK"
|
461
449
|
}
|
462
|
-
|
463
|
-
puts JSON.pretty_generate(result)
|
464
|
-
result
|
465
450
|
end
|
466
451
|
|
467
452
|
# @description
|
@@ -482,8 +467,6 @@ module Instaview
|
|
482
467
|
html = URI.open(url)
|
483
468
|
doc = Nokogiri::HTML(html)
|
484
469
|
|
485
|
-
doc.xpath("//profile-media-list__item").
|
486
|
-
puts item.text
|
487
|
-
end
|
470
|
+
doc.xpath("//profile-media-list__item").map(&:text)
|
488
471
|
end
|
489
472
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: instaview
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nicolas Reiner
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-10-
|
11
|
+
date: 2025-10-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: httparty
|