instagram-scraper 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/instagram_scraper.rb +125 -0
- metadata +44 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 313a97bddadb02842da00028b2574c6ce1f54e4258d5ee13ff89f6d83d245341
|
4
|
+
data.tar.gz: 89b5e5e19582a42fde02825244e4c2a002b404f622e48297d67c92060b75a2ad
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 9e2d57594f0f09204c7ebdb2e86b197e0265d1c785a0a28d557f3dc8331e7ec07c1a053d3ff51a8e5565419fd22e31db999a1091d914a6768c02e45d14643168
|
7
|
+
data.tar.gz: 411953eb8d6360141b69688029c7a6e9bdc2cef7149789183787a42a3133543cf7f5f923dd3a9b16b36e63023428603d77ff7c751a90adf44cfad20776b7945a
|
@@ -0,0 +1,125 @@
|
|
1
|
+
require "csv"
# open-uri and json are used directly below (URI.open, JSON.parse,
# OpenURI::HTTPError) and must be required explicitly instead of relying
# on mechanize to pull them in transitively.
require "json"
require "open-uri"
require "mechanize"

# Root of every Instagram URL built by the scraper.
BASE_URL = "https://www.instagram.com".freeze

# Captures the 32-character GraphQL queryId embedded in Instagram's
# profile-page JS bundle (see InstagramScraper#scrape_query_hash).
QUERY_ID_PATTERN = /[^\)]\.pagination},queryId:"([\w\d]{32})"/.freeze
|
6
|
+
|
7
|
+
# Scrapes the "tagged posts" feed of one or more Instagram profiles and
# exports every post that reaches a minimum number of likes to a CSV file.
#
# Usage:
#   InstagramScraper.new(%w[nike adidas], min_likes: 1000, proxies: ["1.2.3.4:8080"]).perform
class InstagramScraper
  # brands  - Array of Instagram usernames to scrape.
  # options - Hash:
  #   :min_likes   - minimum likes a post needs to be kept (default: 500);
  #                  coerced with #to_i so String input (e.g. from ARGV) works.
  #   :output_file - path of the CSV file to write (default: CWD-relative name).
  #   :proxies     - Array of "ip:port" HTTPS proxies used for GraphQL calls.
  def initialize(brands, options = {})
    @brands = brands
    # Coerce to Integer: like counts are compared with `<` below and a
    # String value (the CLI passes ARGV[1]) would raise an ArgumentError.
    @min_likes = (options[:min_likes] || 500).to_i
    @output_file = options[:output_file] || "./Instagram Data (#{brands.sort.join(', ')}).csv"
    @proxies = options[:proxies] || []
    @data = []
  end

  # Scrapes every brand, then writes the collected posts to @output_file.
  # Nothing is written when no post passed the @min_likes filter.
  def perform
    scrape_brands
    store_data_in_csv unless @data.empty?
  end

  private

  # Iterates over @brands; profiles that cannot be fetched (HTTP error,
  # e.g. a missing or private account) are skipped silently.
  def scrape_brands
    @brands.each do |brand|
      brand_data = scrape_brand_data(brand)
    rescue OpenURI::HTTPError
      next
    else
      scrape_brand_posts(brand_data)
    end
  end

  # Fetches the profile JSON (Instagram's `?__a=1` endpoint) and returns a
  # Hash with the account :id, display name (:brand) and :brand_url.
  def scrape_brand_data(brand)
    brand_url = "#{BASE_URL}/#{brand}"
    brand_data = JSON.parse(URI.open("#{brand_url}/?__a=1").read)["graphql"]["user"]
    {
      id: brand_data["id"],
      brand: brand_data["full_name"],
      brand_url: brand_url,
    }
  end

  # Pages through the brand's tagged-posts GraphQL feed, accumulating
  # qualifying posts into @data. Instagram returns a nil end_cursor on the
  # last page, which terminates the loop.
  def scrape_brand_posts(brand_data, end_cursor = "")
    query_hash = scrape_query_hash
    while end_cursor
      query_params = build_query_params(query_hash, brand_data[:id], end_cursor)
      posts_data = scrape_posts_data(query_params)
      end_cursor = posts_data["page_info"]["end_cursor"]
      posts_data["edges"].each do |post_data|
        post = parse_post_data(post_data["node"])
        @data << brand_data.slice(:brand, :brand_url).merge(post) if post
      end
      puts("Scraped #{@data.count} posts") unless @data.empty?
    end
  end

  # Extracts the GraphQL query hash from Instagram's profile-page JS bundle.
  def scrape_query_hash
    # TODO: scrape bundle name (the fingerprint below changes on deploys)
    bundle_url = "#{BASE_URL}/static/bundles/es6/ProfilePageContainer.js/b10d8b1b32fc.js"
    URI.open(bundle_url).read.match(QUERY_ID_PATTERN)[1]
  end

  # Builds the query-string parameters for one paginated GraphQL request
  # (the :variables value is itself JSON-encoded, as Instagram expects).
  def build_query_params(query_hash, brand_id, end_cursor)
    {
      query_hash: query_hash,
      variables: {
        id: brand_id,
        first: 50,
        after: end_cursor,
      }.to_json,
    }
  end

  # Fetches one page of posts through the HTTPS proxies, rotating to the
  # next proxy on any request/parse failure.
  # Raises a RuntimeError when every proxy has been exhausted — the
  # original code only printed a warning and then crashed with
  # NoMethodError on `nil.split` below.
  def scrape_posts_data(query_params, posts_data = [], proxy_index = 0)
    agent = Mechanize.new
    url = "#{BASE_URL}/graphql/query/?#{URI.encode_www_form(query_params)}"
    while posts_data.empty?
      proxy = @proxies[proxy_index]
      # Fail loudly instead of dereferencing a nil proxy.
      raise "No more proxies available" unless proxy

      ip, port = proxy.split(":")
      agent.set_proxy(ip, port.to_i)
      begin
        posts_data = JSON.parse(agent.get(url).body)["data"]["user"]["edge_user_to_photos_of_you"]
      rescue StandardError
        # Request failed or response was not the expected JSON: try the
        # next proxy on the following iteration.
        proxy_index += 1
      end
    end
    posts_data
  end

  # Maps a raw GraphQL post node to the Hash stored in @data, or nil when
  # the post has fewer than @min_likes likes.
  def parse_post_data(post_data) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
    publisher = post_data["owner"]["username"]
    likes = post_data["edge_liked_by"]["count"]
    return if likes < @min_likes

    {
      publisher: publisher,
      publisher_url: "#{BASE_URL}/#{publisher}",
      post_url: "#{BASE_URL}/p/#{post_data['shortcode']}",
      likes: likes,
      comments: post_data["edge_media_to_comment"]["count"],
      # Time.at, not Time.zone.at: Time.zone is ActiveSupport and this gem
      # has no dependencies, so the original raised NoMethodError here.
      date: Time.at(post_data["taken_at_timestamp"]).strftime("%d/%m/%Y"),
      # Caption may be absent (empty edges array); newlines are flattened
      # so the caption stays on one CSV row.
      caption: post_data["edge_media_to_caption"]["edges"]&.first&.[]("node")&.[]("text")&.gsub(/\n/, " "),
    }
  end

  # Writes @data to @output_file with humanized column headers
  # (e.g. :publisher_url => "Publisher url").
  def store_data_in_csv
    headers = @data.first.keys.map { |key| key.to_s.tr("_", " ").capitalize }
    CSV.open(@output_file, "wb", write_headers: true, headers: headers) do |csv|
      @data.each { |post| csv << post.values }
    end
  end
end
|
117
|
+
|
118
|
+
# --- CLI entry point -------------------------------------------------------
# Usage: instagram_scraper BRANDS [MIN_LIKES]
#   BRANDS    - comma-separated Instagram usernames, e.g. "nike,adidas"
#   MIN_LIKES - optional minimum likes per post (default: 500)
abort("Usage: instagram_scraper BRANDS [MIN_LIKES]") if ARGV[0].nil? || ARGV[0].empty?

options = {
  # &.to_i: ARGV values are Strings; the scraper compares likes numerically,
  # and nil falls through to the scraper's default of 500.
  min_likes: ARGV[1]&.to_i,
  # Dir.home instead of a hard-coded /Users/<name> path, so the script
  # works for any user.
  output_file: "#{Dir.home}/Desktop/Instagram Data (#{ARGV[0].gsub(',', ', ').strip}).csv",
  # Free HTTPS proxies, one "ip:port" per CRLF-separated line.
  proxies: URI.open("https://www.proxy-list.download/api/v1/get?type=https").read.split(/\r\n/),
}
scraper = InstagramScraper.new(ARGV[0].split(","), options)
scraper.perform
# Best effort: open the resulting CSV in Numbers (macOS); errors are discarded.
`open -a Numbers '#{options[:output_file]}' 2>/dev/null`
|
metadata
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: instagram-scraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- gabrielecanepa
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-08-02 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Scrape tagged posts and their corresponding publishers from an Instagram profile.
|
14
|
+
email: contact@gabrielecanepa.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/instagram_scraper.rb
|
20
|
+
homepage: https://rubygems.org/gems/instagram-scraper
|
21
|
+
licenses:
|
22
|
+
- MIT
|
23
|
+
metadata:
|
24
|
+
source_code_uri: https://github.com/gabrielecanepa/instagram-scraper
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
requirements: []
|
40
|
+
rubygems_version: 3.0.3
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: Easily scrape any Instagram profile.
|
44
|
+
test_files: []
|