instagram-scraper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/instagram_scraper.rb +125 -0
  3. metadata +44 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 313a97bddadb02842da00028b2574c6ce1f54e4258d5ee13ff89f6d83d245341
4
+ data.tar.gz: 89b5e5e19582a42fde02825244e4c2a002b404f622e48297d67c92060b75a2ad
5
+ SHA512:
6
+ metadata.gz: 9e2d57594f0f09204c7ebdb2e86b197e0265d1c785a0a28d557f3dc8331e7ec07c1a053d3ff51a8e5565419fd22e31db999a1091d914a6768c02e45d14643168
7
+ data.tar.gz: 411953eb8d6360141b69688029c7a6e9bdc2cef7149789183787a42a3133543cf7f5f923dd3a9b16b36e63023428603d77ff7c751a90adf44cfad20776b7945a
@@ -0,0 +1,125 @@
1
+ require "csv"
2
+ require "mechanize"
3
+
4
+ BASE_URL = "https://www.instagram.com".freeze
5
+ QUERY_ID_PATTERN = /[^\)]\.pagination},queryId:"([\w\d]{32})"/.freeze
6
+
7
# Scrapes tagged ("photos of you") posts from Instagram brand profiles and
# exports the ones above a minimum like count to a CSV file.
class InstagramScraper
  # brands  - Array of Instagram usernames to scrape.
  # options - Optional Hash:
  #           :min_likes   - minimum likes for a post to be kept (default: 500).
  #           :output_file - String path of the CSV file to write.
  #           :proxies     - Array of "ip:port" strings used for GraphQL requests.
  def initialize(brands, options = {})
    @brands = brands
    # Coerce so CLI-supplied strings (e.g. "300") compare correctly against Integer like counts.
    @min_likes = (options[:min_likes] || 500).to_i
    @output_file = options[:output_file] || "./Instagram Data (#{brands.sort.join(', ')}).csv"
    @proxies = options[:proxies] || []
    @data = []
  end

  # Scrapes every brand and writes the collected posts to CSV (skipped when nothing was found).
  def perform
    scrape_brands
    store_data_in_csv unless @data.empty?
  end

  private

  # Iterates over the brands, skipping any profile that cannot be fetched.
  def scrape_brands
    @brands.each do |brand|
      brand_data = scrape_brand_data(brand)
    rescue OpenURI::HTTPError
      # Profile missing, private, or rate-limited: move on to the next brand.
      next
    else
      scrape_brand_posts(brand_data)
    end
  end

  # Fetches a brand's public profile JSON and returns its id, display name and URL.
  def scrape_brand_data(brand)
    brand_url = "#{BASE_URL}/#{brand}"
    # "?__a=1" asks Instagram for the profile as JSON instead of HTML.
    brand_data = JSON.parse(URI.open("#{brand_url}/?__a=1").read)["graphql"]["user"]
    {
      id: brand_data["id"],
      brand: brand_data["full_name"],
      brand_url: brand_url,
    }
  end

  # Pages through the brand's tagged posts until Instagram returns no next cursor.
  def scrape_brand_posts(brand_data, end_cursor = "")
    query_hash = scrape_query_hash
    while end_cursor
      query_params = build_query_params(query_hash, brand_data[:id], end_cursor)
      posts_data = scrape_posts_data(query_params)
      end_cursor = posts_data["page_info"]["end_cursor"]
      posts_data["edges"].each do |post_data|
        post = parse_post_data(post_data["node"])
        @data << brand_data.slice(:brand, :brand_url).merge(post) if post
      end
      puts("Scraped #{@data.count} posts") unless @data.empty?
    end
  end

  # Extracts the GraphQL query hash from Instagram's profile-page JS bundle.
  def scrape_query_hash
    # TODO: scrape bundle name — the hashed filename below changes between deployments.
    bundle_url = "#{BASE_URL}/static/bundles/es6/ProfilePageContainer.js/b10d8b1b32fc.js"
    URI.open(bundle_url).read.match(QUERY_ID_PATTERN)[1]
  end

  # Builds the query-string parameters for one page (50 posts) of the GraphQL query.
  def build_query_params(query_hash, brand_id, end_cursor)
    {
      query_hash: query_hash,
      variables: {
        id: brand_id,
        first: 50,
        after: end_cursor,
      }.to_json,
    }
  end

  # Requests one page of tagged posts, rotating through the proxy list until a
  # request succeeds. Raises a RuntimeError when every proxy has been exhausted.
  def scrape_posts_data(query_params, posts_data = [], proxy_index = 0)
    agent = Mechanize.new
    url = "#{BASE_URL}/graphql/query/?#{URI.encode_www_form(query_params)}"
    while posts_data.empty?
      proxy = @proxies[proxy_index]
      # Fail loudly: the original only printed a warning here and then crashed
      # with NoMethodError on nil when calling proxy.split below.
      raise "No more proxies available" unless proxy

      ip, port = proxy.split(":")
      agent.set_proxy(ip, port.to_i)
      begin
        posts_data = JSON.parse(agent.get(url).body)["data"]["user"]["edge_user_to_photos_of_you"]
      rescue StandardError
        # Proxy failed or the response was malformed: try the next proxy.
        proxy_index += 1
      end
    end
    posts_data
  end

  # Maps a raw GraphQL post node to a CSV row Hash, or nil when the post has
  # fewer likes than @min_likes.
  def parse_post_data(post_data) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
    publisher = post_data["owner"]["username"]
    likes = post_data["edge_liked_by"]["count"]
    return if likes < @min_likes

    {
      publisher: publisher,
      publisher_url: "#{BASE_URL}/#{publisher}",
      post_url: "#{BASE_URL}/p/#{post_data['shortcode']}",
      likes: likes,
      comments: post_data["edge_media_to_comment"]["count"],
      # Time.zone is ActiveSupport-only and this gem declares no dependencies;
      # core Time.at works in plain Ruby.
      date: Time.at(post_data["taken_at_timestamp"]).strftime("%d/%m/%Y"),
      caption: post_data["edge_media_to_caption"]["edges"]&.first&.[]("node")&.[]("text")&.gsub(/\n/, " "),
    }
  end

  # Writes all collected posts to @output_file with humanized column headers
  # (e.g. :publisher_url -> "Publisher url").
  def store_data_in_csv
    headers = @data.first.keys.map { |key| key.to_s.tr("_", " ").capitalize }
    CSV.open(@output_file, "wb", write_headers: true, headers: headers) do |csv|
      @data.each { |post| csv << post.values }
    end
  end
end
117
+
118
# CLI entry point: ARGV[0] = comma-separated brand handles, ARGV[1] = optional minimum likes.
abort("Usage: instagram_scraper BRAND[,BRAND...] [MIN_LIKES]") if ARGV.empty?

options = {
  # Coerce to Integer so it compares cleanly with like counts; nil keeps the scraper default.
  min_likes: ARGV[1]&.to_i,
  # Use the current user's Desktop rather than a hard-coded developer path.
  output_file: File.join(Dir.home, "Desktop", "Instagram Data (#{ARGV[0].gsub(',', ', ').strip}).csv"),
  proxies: URI.open("https://www.proxy-list.download/api/v1/get?type=https").read.split(/\r\n/),
}
scraper = InstagramScraper.new(ARGV[0].split(","), options)
scraper.perform
# Multi-argument system avoids interpolating the path into a shell command;
# errors (e.g. Numbers not installed) are silently discarded as before.
system("open", "-a", "Numbers", options[:output_file], err: File::NULL)
metadata ADDED
@@ -0,0 +1,44 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: instagram-scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - gabrielecanepa
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-08-02 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Scrape tagged posts and their corresponding publishers from an Instagram profile.
14
+ email: contact@gabrielecanepa.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/instagram_scraper.rb
20
+ homepage: https://rubygems.org/gems/instagram-scraper
21
+ licenses:
22
+ - MIT
23
+ metadata:
24
+ source_code_uri: https://github.com/gabrielecanepa/instagram-scraper
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubygems_version: 3.0.3
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Easily scrape any Instagram profile.
44
+ test_files: []