instaview 0.1.0 → 0.2.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6ecbb088043452c55b479846568262489b6c217798de96ccc0acdc7a6dfcfffb
4
- data.tar.gz: b3353cca7a6219a42e4d5c31de08a3136b6760ba8f7fd3fc732febcb1881dd04
3
+ metadata.gz: 8a2e76b7660b0fc4f9b682c5e7ff1d1385cef855773cd6943d622e50e7ce79da
4
+ data.tar.gz: 375a1a3a9e60601d4260e1191b6941a23300de096caadb8096c13afe059ab99d
5
5
  SHA512:
6
- metadata.gz: 4430729542295f37cd35935cbb5589edf796c0940d87153281305c249697b1eaa257f4dba81dd8c15250954d17b8962fa56113d8d2ad3609d712e0f266243f8b
7
- data.tar.gz: e1977fc370f81854906d66d983e5a08a3bb49aeacb1ae4faa9a2d1c144584ab3299588377bd10e829649cb86026be3e17e01b4c18865ce138d05748f32774904
6
+ metadata.gz: 91a033d9f63e75c33b7d64d3442a4051f5257e5bd6ab81967959881c54d4d574d352d6d8cb028078452b83ae67b3d23bb9fa53e967f731f650e054b56336c44a
7
+ data.tar.gz: f055ac7ccef5ed4f6da9d3547bd88dbd2a879c0aad68e457413c105d39c63085119a62af27bbcfe0fbd23c346f5a3036556244cc7ff8107087a212f9f254f005
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Instaview
4
- VERSION = "0.1.0"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/instaview.rb CHANGED
@@ -8,6 +8,7 @@ require 'net/http'
8
8
  require 'uri'
9
9
  require 'fileutils'
10
10
  require 'time'
11
+ require 'thread'
11
12
 
12
13
  module Instaview
13
14
  class Error < StandardError; end
@@ -41,28 +42,25 @@ module Instaview
41
42
  raise ArgumentError, "username is required" if username.nil? || username.to_s.strip.empty?
42
43
 
43
44
  Thread.new do
44
- begin
45
- result = case method
46
- when :selenium
47
- scrape_instagram_stories(username)
48
- when :simple_http
49
- scrape_with_simple_http(username)
50
- else
51
- scrape_instagram_stories(username)
52
- end
53
-
54
- # Persist to cache on success
45
+ result = case method
46
+ when :selenium
47
+ scrape_instagram_stories(username)
48
+ when :simple_http
49
+ scrape_with_simple_http(username)
50
+ else
51
+ scrape_instagram_stories(username)
52
+ end
53
+
54
+ # Persist to cache on success
55
+ if data_found?(result)
55
56
  begin
56
57
  write_to_cache(username, result)
57
- rescue => e
58
- warn "Instaview: failed to write cache for #{username}: #{e.message}"
58
+ rescue StandardError
59
+ # Ignore cache write failures to avoid affecting callers
59
60
  end
60
-
61
- result
62
- rescue => e
63
- warn "Instaview: async fetch failed for #{username}: #{e.message}"
64
- raise
65
61
  end
62
+
63
+ result
66
64
  end
67
65
  end
68
66
 
@@ -79,11 +77,8 @@ module Instaview
79
77
  def self.get_from_cache_or_async(username, max_age_hours: 12, method: :selenium)
80
78
  max_age_seconds = (max_age_hours.to_i * 3600)
81
79
  cached = read_from_cache(username, max_age_seconds: max_age_seconds)
82
- if cached
83
- puts "Using cached data for #{username}"
84
- return cached
85
- end
86
- puts "No valid cache found for #{username}, fetching data..."
80
+ return cached if cached
81
+
87
82
  t = fetch_data_async(username, method: method)
88
83
  t.value # join and return result
89
84
  end
@@ -147,6 +142,7 @@ module Instaview
147
142
 
148
143
  content = File.read(path)
149
144
  data = JSON.parse(content, symbolize_names: true)
145
+ return nil unless data_found?(data)
150
146
  # annotate so callers can tell it came from cache
151
147
  if data.is_a?(Hash)
152
148
  data[:cached] = true
@@ -171,6 +167,29 @@ module Instaview
171
167
  true
172
168
  end
173
169
 
170
+ def self.data_found?(data)
171
+ return false unless data.is_a?(Hash)
172
+
173
+ success = data[:success]
174
+ return true if success == true
175
+
176
+ method = data[:method]
177
+
178
+ if method == "selenium_storiesig"
179
+ media_found = data[:media_items_found].to_i > 0
180
+ media_present = data[:media_items].is_a?(Array) && !data[:media_items].empty?
181
+ return true if media_found || media_present
182
+ elsif method == "simple_http_curl"
183
+ forms_found = data[:forms_found].to_i > 0
184
+ inputs_found = data[:inputs_found].to_i > 0
185
+ samples_present = data[:sample_images].is_a?(Array) && !data[:sample_images].empty?
186
+ return true if forms_found || inputs_found || samples_present
187
+ end
188
+
189
+ false
190
+ end
191
+ private_class_method :data_found?
192
+
174
193
  # @description
175
194
  # Use Selenium WebDriver to automate StoriesIG and extract media details for a username.
176
195
  # @Parameter
@@ -178,10 +197,11 @@ module Instaview
178
197
  # @Return values
179
198
  # Hash - Structured result including extracted media and metadata
180
199
  # @Errors
181
- # StandardError - on Selenium/WebDriver failures or selector timeouts
200
+ # Instaview::Error - on Selenium/WebDriver failures or selector timeouts
182
201
  def self.scrape_instagram_stories(username = nil)
183
202
  target_username = username || ARGV[0] # pass username as argument
184
203
 
204
+ driver = nil
185
205
  begin
186
206
  # Setup Selenium WebDriver with headless Chrome
187
207
  options = Selenium::WebDriver::Chrome::Options.new
@@ -204,14 +224,10 @@ module Instaview
204
224
  "/usr/bin/chromium-browser",
205
225
  "/usr/bin/google-chrome"
206
226
  ]
207
-
227
+
208
228
  chrome_binary = chrome_paths.find { |path| File.exist?(path) }
209
-
210
- if chrome_binary
211
- options.binary = chrome_binary
212
- puts "Using Chrome binary: #{chrome_binary}"
213
- end
214
-
229
+ options.binary = chrome_binary if chrome_binary
230
+
215
231
  driver = Selenium::WebDriver.for :chrome, options: options
216
232
 
217
233
  # 1) Go to StoriesIG homepage
@@ -219,65 +235,54 @@ module Instaview
219
235
  sleep 2
220
236
 
221
237
  # 2) Find the specific search input for StoriesIG
222
- puts "Looking for search input..."
223
- input_element = nil
224
-
225
- # Wait for page to load and find the specific input
226
238
  wait = Selenium::WebDriver::Wait.new(timeout: 10)
227
-
228
- begin
229
- input_element = wait.until do
239
+
240
+ input_element = begin
241
+ wait.until do
230
242
  element = driver.find_element(:css, 'input.search.search-form__input[placeholder*="username"]')
231
243
  element if element.displayed?
232
244
  end
233
245
  rescue Selenium::WebDriver::Error::TimeoutError
234
- raise "Search input not found with selector: input.search.search-form__input"
246
+ raise Instaview::Error, "Search input not found with selector: input.search.search-form__input"
235
247
  end
236
248
 
237
- puts "Found search input, entering username: #{target_username}"
238
249
  input_element.clear
239
250
  input_element.send_keys(target_username)
240
251
 
241
252
  # 3) Click the specific search button
242
- puts "Looking for search button..."
243
253
  begin
244
254
  button_element = driver.find_element(:css, 'button.search-form__button')
245
- puts "Found search button, clicking..."
246
255
  button_element.click
247
256
  rescue Selenium::WebDriver::Error::NoSuchElementError
248
- puts "Search button not found, trying Enter key..."
249
257
  input_element.send_keys(:return)
250
258
  end
251
259
 
252
260
  # 4) Wait for results to load and check different possible outcomes
253
- puts "Waiting for results to load..."
254
261
  sleep 3
255
-
262
+
256
263
  # Check for various possible page states
257
264
  page_state = "unknown"
258
265
  error_message = nil
259
-
266
+
260
267
  # Check if media items loaded
261
268
  media_items = driver.find_elements(:css, 'li.profile-media-list__item')
262
269
  if media_items.length > 0
263
270
  page_state = "media_found"
264
- puts "Found #{media_items.length} media items!"
265
271
  else
266
272
  # Check for error messages or other states
267
273
  sleep 2 # Give it more time
268
274
  media_items = driver.find_elements(:css, 'li.profile-media-list__item')
269
-
275
+
270
276
  if media_items.length > 0
271
277
  page_state = "media_found_delayed"
272
- puts "Found #{media_items.length} media items after delay!"
273
278
  else
274
279
  # Look for common error indicators
275
280
  error_selectors = [
276
- '.error', '.alert', '.warning',
281
+ '.error', '.alert', '.warning',
277
282
  '[class*="error"]', '[class*="not-found"]',
278
283
  'p:contains("not found")', 'div:contains("error")'
279
284
  ]
280
-
285
+
281
286
  error_found = false
282
287
  error_selectors.each do |selector|
283
288
  begin
@@ -287,18 +292,12 @@ module Instaview
287
292
  error_found = true
288
293
  break
289
294
  end
290
- rescue
295
+ rescue StandardError
291
296
  # Continue checking other selectors
292
297
  end
293
298
  end
294
-
295
- if error_found
296
- page_state = "error_found"
297
- puts "Error found: #{error_message}"
298
- else
299
- page_state = "no_media"
300
- puts "No media items found, checking page content..."
301
- end
299
+
300
+ page_state = error_found ? "error_found" : "no_media"
302
301
  end
303
302
  end
304
303
 
@@ -308,34 +307,34 @@ module Instaview
308
307
 
309
308
  # Extract specific media items using the provided selector
310
309
  media_list_items = doc.css('li.profile-media-list__item')
311
-
310
+
312
311
  extracted_media = []
313
- media_list_items.each_with_index do |item, index|
312
+ media_list_items.each do |item|
314
313
  media_data = {}
315
-
314
+
316
315
  # Extract image source
317
316
  img_element = item.css('.media-content__image').first
318
317
  if img_element
319
318
  media_data[:image_url] = img_element['src']
320
319
  media_data[:alt_text] = img_element['alt']
321
320
  end
322
-
321
+
323
322
  # Extract caption
324
323
  caption_element = item.css('.media-content__caption').first
325
324
  media_data[:caption] = caption_element&.text&.strip
326
-
325
+
327
326
  # Extract download link
328
327
  download_element = item.css('a.button.button--filled.button__download').first
329
328
  media_data[:download_url] = download_element['href'] if download_element
330
-
329
+
331
330
  # Extract metadata
332
331
  like_element = item.css('.media-content__meta-like').first
333
332
  media_data[:likes] = like_element&.text&.strip
334
-
333
+
335
334
  time_element = item.css('.media-content__meta-time').first
336
335
  media_data[:time] = time_element&.text&.strip
337
336
  media_data[:time_title] = time_element['title'] if time_element
338
-
337
+
339
338
  extracted_media << media_data unless media_data.empty?
340
339
  end
341
340
 
@@ -365,16 +364,13 @@ module Instaview
365
364
  screenshot_path = "/tmp/instaview_debug_#{Time.now.to_i}.png"
366
365
  driver.save_screenshot(screenshot_path)
367
366
  result[:debug_info][:screenshot_path] = screenshot_path
368
- puts "Debug screenshot saved to: #{screenshot_path}"
369
367
  end
370
368
 
371
- puts JSON.pretty_generate(result)
372
-
373
369
  result
370
+ rescue Instaview::Error
371
+ raise
374
372
  rescue => e
375
- puts "Error: #{e.message}"
376
- puts "Make sure Chrome/Chromium is installed for Selenium WebDriver"
377
- raise e
373
+ raise Instaview::Error, "Selenium scraping failed: #{e.message}"
378
374
  ensure
379
375
  driver&.quit
380
376
  end
@@ -388,48 +384,42 @@ module Instaview
388
384
  # Hash - Basic page analysis and sample assets; primarily for diagnostics
389
385
  # @Errors
390
386
  # ArgumentError - if `username` is nil or empty
391
- # StandardError - if the curl command fails or returns empty content
387
+ # Instaview::Error - if the curl command fails or other errors occur
392
388
  def self.scrape_with_simple_http(username = nil)
393
389
  target_username = username
394
- throw ArgumentError, "Username is required for simple HTTP method" if target_username.nil? || target_username.empty?
390
+ raise ArgumentError, "Username is required for simple HTTP method" if target_username.nil? || target_username.empty?
391
+
395
392
  begin
396
393
  # Simple HTTP approach using curl
397
- puts "Trying to fetch page with curl..."
398
-
399
394
  curl_command = "curl -s -L -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 'https://storiesig.info/'"
400
-
395
+
401
396
  html_content = `#{curl_command}`
402
-
403
- if $?.success? && !html_content.empty?
404
- doc = Nokogiri::HTML(html_content)
405
-
406
- # Extract basic page information
407
- title = doc.css('title').text
408
- forms = doc.css('form')
409
- inputs = doc.css('input[type="text"], input[name*="user"]')
410
-
411
- # Look for any existing media or links
412
- images = doc.css('img').map { |img| img['src'] }.compact.select { |src| src.start_with?('http') }
413
- links = doc.css('a').map { |link| link['href'] }.compact.select { |href| href.include?('instagram') || href.include?('media') }
414
-
415
- result = {
416
- username: target_username,
417
- method: "simple_http_curl",
418
- forms_found: forms.length,
419
- inputs_found: inputs.length,
420
- sample_images: images.first(3),
421
- message: "Simple HTTP method using curl - shows page structure. For full automation use selenium method."
422
- }
423
-
424
- puts JSON.pretty_generate(result)
425
- result
426
- else
427
- raise "Curl command failed or returned empty content"
397
+
398
+ unless $?.success? && !html_content.empty?
399
+ raise Instaview::Error, "Curl command failed or returned empty content"
428
400
  end
401
+
402
+ doc = Nokogiri::HTML(html_content)
403
+
404
+ # Extract basic page information
405
+ forms = doc.css('form')
406
+ inputs = doc.css('input[type="text"], input[name*="user"]')
407
+
408
+ # Look for any existing media or links
409
+ images = doc.css('img').map { |img| img['src'] }.compact.select { |src| src.start_with?('http') }
410
+
411
+ {
412
+ username: target_username,
413
+ method: "simple_http_curl",
414
+ forms_found: forms.length,
415
+ inputs_found: inputs.length,
416
+ sample_images: images.first(3),
417
+ message: "Simple HTTP method using curl - shows page structure. For full automation use selenium method."
418
+ }
419
+ rescue Instaview::Error
420
+ raise
429
421
  rescue => e
430
- puts "Error with simple HTTP method: #{e.message}"
431
- puts "Try using scrape_instagram_stories method instead"
432
- raise e
422
+ raise Instaview::Error, "HTTP scraping failed: #{e.message}"
433
423
  end
434
424
  end
435
425
 
@@ -443,9 +433,7 @@ module Instaview
443
433
  # None
444
434
  def self.test_connectivity
445
435
  # Simple test method to verify the gem works
446
- puts "Testing Instaview gem connectivity..."
447
-
448
- result = {
436
+ {
449
437
  gem_name: "Instaview",
450
438
  version: Instaview::VERSION,
451
439
  methods_available: [
@@ -459,9 +447,6 @@ module Instaview
459
447
  ],
460
448
  status: "OK"
461
449
  }
462
-
463
- puts JSON.pretty_generate(result)
464
- result
465
450
  end
466
451
 
467
452
  # @description
@@ -482,8 +467,6 @@ module Instaview
482
467
  html = URI.open(url)
483
468
  doc = Nokogiri::HTML(html)
484
469
 
485
- doc.xpath("//profile-media-list__item").each do |item|
486
- puts item.text
487
- end
470
+ doc.xpath("//profile-media-list__item").map(&:text)
488
471
  end
489
472
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: instaview
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nicolas Reiner
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-10-01 00:00:00.000000000 Z
11
+ date: 2025-10-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: httparty