instagram-scraper 0.0.1

Files changed (3)
  1. checksums.yaml +7 -0
  2. data/lib/instagram_scraper.rb +125 -0
  3. metadata +44 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: 313a97bddadb02842da00028b2574c6ce1f54e4258d5ee13ff89f6d83d245341
+   data.tar.gz: 89b5e5e19582a42fde02825244e4c2a002b404f622e48297d67c92060b75a2ad
+ SHA512:
+   metadata.gz: 9e2d57594f0f09204c7ebdb2e86b197e0265d1c785a0a28d557f3dc8331e7ec07c1a053d3ff51a8e5565419fd22e31db999a1091d914a6768c02e45d14643168
+   data.tar.gz: 411953eb8d6360141b69688029c7a6e9bdc2cef7149789183787a42a3133543cf7f5f923dd3a9b16b36e63023428603d77ff7c751a90adf44cfad20776b7945a
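
The checksums file pairs each packaged archive with its SHA-256 and SHA-512 digests. A quick way to verify a downloaded copy against them, using only the Ruby standard library (the file names assume the unpacked gem layout):

require "digest"

# Compare these against the values in checksums.yaml
puts Digest::SHA256.file("metadata.gz").hexdigest
puts Digest::SHA512.file("data.tar.gz").hexdigest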
data/lib/instagram_scraper.rb ADDED
@@ -0,0 +1,134 @@
+ require "csv"
+ require "json"
+ require "mechanize"
+ require "open-uri"
+
+ BASE_URL = "https://www.instagram.com".freeze
+ QUERY_ID_PATTERN = /[^\)]\.pagination},queryId:"([\w\d]{32})"/.freeze
+
+ class InstagramScraper
+   def initialize(brands, options = {})
+     @brands = brands
+     @min_likes = options[:min_likes] || 500
+     @output_file = options[:output_file] || "./Instagram Data (#{brands.sort.join(', ')}).csv"
+     @proxies = options[:proxies] || []
+     @data = []
+   end
+
+   def perform
+     scrape_brands
+     store_data_in_csv unless @data.empty?
+   end
+
+   private
+
+   def scrape_brands
+     @brands.each do |brand|
+       brand_data = scrape_brand_data(brand)
+     rescue OpenURI::HTTPError
+       # Skip profiles that can't be fetched
+       next
+     else
+       scrape_brand_posts(brand_data)
+     end
+   end
+
+   # Fetch profile metadata from Instagram's public JSON endpoint
+   def scrape_brand_data(brand)
+     brand_url = "#{BASE_URL}/#{brand}"
+     brand_data = JSON.parse(URI.open("#{brand_url}/?__a=1").read)["graphql"]["user"]
+     {
+       id: brand_data["id"],
+       brand: brand_data["full_name"],
+       brand_url: brand_url,
+     }
+   end
+
+   # Paginate through tagged posts until no end cursor is returned
+   def scrape_brand_posts(brand_data, end_cursor = "")
+     query_hash = scrape_query_hash
+     while end_cursor
+       query_params = build_query_params(query_hash, brand_data[:id], end_cursor)
+       posts_data = scrape_posts_data(query_params)
+       end_cursor = posts_data["page_info"]["end_cursor"]
+       posts_data["edges"].each do |post_data|
+         post = parse_post_data(post_data["node"])
+         @data << brand_data.slice(:brand, :brand_url).merge(post) if post
+       end
+       puts("Scraped #{@data.count} posts") unless @data.empty?
+     end
+   end
+
+   # Extract the 32-character GraphQL query hash from Instagram's profile page bundle
+   def scrape_query_hash
+     # TODO: scrape bundle name
+     bundle_url = "#{BASE_URL}/static/bundles/es6/ProfilePageContainer.js/b10d8b1b32fc.js"
+     URI.open(bundle_url).read.match(QUERY_ID_PATTERN)[1]
+   end
+
+   def build_query_params(query_hash, brand_id, end_cursor)
+     {
+       query_hash: query_hash,
+       variables: {
+         id: brand_id,
+         first: 50,
+         after: end_cursor,
+       }.to_json,
+     }
+   end
+
+   # Query the GraphQL endpoint, rotating through proxies on failure
+   def scrape_posts_data(query_params, posts_data = [], proxy_index = 0)
+     agent = Mechanize.new
+     url = "#{BASE_URL}/graphql/query/?#{URI.encode_www_form(query_params)}"
+     while posts_data.empty?
+       proxy = @proxies[proxy_index]
+       unless proxy
+         puts("No more proxies available")
+         # Return an empty page so pagination stops instead of crashing on a nil proxy
+         return { "page_info" => { "end_cursor" => nil }, "edges" => [] }
+       end
+
+       ip, port = proxy.split(":")
+       agent.set_proxy(ip, port.to_i)
+       begin
+         posts_data = JSON.parse(agent.get(url).body)["data"]["user"]["edge_user_to_photos_of_you"]
+       rescue StandardError
+         proxy_index += 1
+       end
+     end
+     posts_data
+   end
+
+   def parse_post_data(post_data) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+     publisher = post_data["owner"]["username"]
+     likes = post_data["edge_liked_by"]["count"]
+     return if likes < @min_likes
+
+     {
+       publisher: publisher,
+       publisher_url: "#{BASE_URL}/#{publisher}",
+       post_url: "#{BASE_URL}/p/#{post_data['shortcode']}",
+       likes: likes,
+       comments: post_data["edge_media_to_comment"]["count"],
+       date: Time.at(post_data["taken_at_timestamp"]).strftime("%d/%m/%Y"),
+       caption: post_data.dig("edge_media_to_caption", "edges", 0, "node", "text")&.gsub(/\n/, " "),
+     }
+   end
+
+   def store_data_in_csv
+     headers = @data.first.keys.map { |key| key.to_s.tr("_", " ").capitalize }
+     CSV.open(@output_file, "wb", write_headers: true, headers: headers) do |csv|
+       @data.each { |post| csv << post.values }
+     end
+   end
+ end
+
+ options = {
+   min_likes: ARGV[1]&.to_i,
+   output_file: "/Users/gabrielecanepa/Desktop/Instagram Data (#{ARGV[0].gsub(',', ', ').strip}).csv",
+   proxies: URI.open("https://www.proxy-list.download/api/v1/get?type=https").read.split(/\r\n/),
+ }
+ scraper = InstagramScraper.new(ARGV[0].split(","), options)
+ scraper.perform
+ `open -a Numbers '#{options[:output_file]}' 2>/dev/null`
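
As released, the library file doubles as a command-line script: loading it runs the block at the bottom, which takes a comma-separated list of profiles in ARGV[0] and an optional minimum like count in ARGV[1], downloads an HTTPS proxy list, writes the CSV to a hardcoded Desktop path, and opens it in Numbers. A hypothetical invocation (the profile names are placeholders, not from the source):

ruby lib/instagram_scraper.rb nike,adidas 1000

To drive the class directly instead, a minimal sketch, assuming the script tail is moved behind an if __FILE__ == $PROGRAM_NAME guard so that requiring the file doesn't trigger it:

require "instagram_scraper"

# All values are illustrative; proxies are "ip:port" strings, as expected by scrape_posts_data
options = {
  min_likes: 1_000,                     # defaults to 500 when omitted
  output_file: "./instagram-data.csv",  # defaults to a name derived from the brands
  proxies: ["203.0.113.5:3128"],
}
InstagramScraper.new(%w[nike adidas], options).perform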
metadata ADDED
@@ -0,0 +1,44 @@
+ --- !ruby/object:Gem::Specification
+ name: instagram-scraper
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - gabrielecanepa
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2019-08-02 00:00:00.000000000 Z
+ dependencies: []
+ description: Scrape tagged posts and corresponding publishers from an Instagram profile.
+ email: contact@gabrielecanepa.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/instagram_scraper.rb
+ homepage: https://rubygems.org/gems/instagram-scraper
+ licenses:
+ - MIT
+ metadata:
+   source_code_uri: https://github.com/gabrielecanepa/instagram-scraper
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.0.3
+ signing_key:
+ specification_version: 4
+ summary: Easily scrape any Instagram profile.
+ test_files: []
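
Note that dependencies is empty even though the library requires mechanize, so installing this gem won't pull Mechanize in automatically; it has to be installed separately. With lib as the require path and lib/instagram_scraper.rb as the gem's only file, the standard loading step after gem install instagram-scraper is:

require "instagram_scraper"

Keep in mind the caveat above: as of 0.0.1, this require also executes the CLI block at the end of the file.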