instagram-scraper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/instagram_scraper.rb +125 -0
  3. metadata +44 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 313a97bddadb02842da00028b2574c6ce1f54e4258d5ee13ff89f6d83d245341
4
+ data.tar.gz: 89b5e5e19582a42fde02825244e4c2a002b404f622e48297d67c92060b75a2ad
5
+ SHA512:
6
+ metadata.gz: 9e2d57594f0f09204c7ebdb2e86b197e0265d1c785a0a28d557f3dc8331e7ec07c1a053d3ff51a8e5565419fd22e31db999a1091d914a6768c02e45d14643168
7
+ data.tar.gz: 411953eb8d6360141b69688029c7a6e9bdc2cef7149789183787a42a3133543cf7f5f923dd3a9b16b36e63023428603d77ff7c751a90adf44cfad20776b7945a
@@ -0,0 +1,125 @@
1
+ require "csv"
2
+ require "mechanize"
3
+
4
+ BASE_URL = "https://www.instagram.com".freeze
5
+ QUERY_ID_PATTERN = /[^\)]\.pagination},queryId:"([\w\d]{32})"/.freeze
6
+
7
# Scrapes tagged ("photos of you") posts from Instagram brand profiles and
# exports the ones above a minimum like count to a CSV file.
class InstagramScraper
  # brands  - Array of Instagram usernames to scrape.
  # options - Optional Hash:
  #           :min_likes   - minimum likes for a post to be kept (default: 500).
  #           :output_file - String path of the CSV file to write.
  #           :proxies     - Array of "ip:port" strings used for GraphQL requests.
  def initialize(brands, options = {})
    @brands = brands
    # Coerce so CLI-supplied strings (e.g. "300") compare correctly against Integer like counts.
    @min_likes = (options[:min_likes] || 500).to_i
    @output_file = options[:output_file] || "./Instagram Data (#{brands.sort.join(', ')}).csv"
    @proxies = options[:proxies] || []
    @data = []
  end

  # Scrapes every brand and writes the collected posts to CSV (skipped when nothing was found).
  def perform
    scrape_brands
    store_data_in_csv unless @data.empty?
  end

  private

  # Iterates over the brands, skipping any profile that cannot be fetched.
  def scrape_brands
    @brands.each do |brand|
      brand_data = scrape_brand_data(brand)
    rescue OpenURI::HTTPError
      # Profile missing, private, or rate-limited: move on to the next brand.
      next
    else
      scrape_brand_posts(brand_data)
    end
  end

  # Fetches a brand's public profile JSON and returns its id, display name and URL.
  def scrape_brand_data(brand)
    brand_url = "#{BASE_URL}/#{brand}"
    # "?__a=1" asks Instagram for the profile as JSON instead of HTML.
    brand_data = JSON.parse(URI.open("#{brand_url}/?__a=1").read)["graphql"]["user"]
    {
      id: brand_data["id"],
      brand: brand_data["full_name"],
      brand_url: brand_url,
    }
  end

  # Pages through the brand's tagged posts until Instagram returns no next cursor.
  def scrape_brand_posts(brand_data, end_cursor = "")
    query_hash = scrape_query_hash
    while end_cursor
      query_params = build_query_params(query_hash, brand_data[:id], end_cursor)
      posts_data = scrape_posts_data(query_params)
      end_cursor = posts_data["page_info"]["end_cursor"]
      posts_data["edges"].each do |post_data|
        post = parse_post_data(post_data["node"])
        @data << brand_data.slice(:brand, :brand_url).merge(post) if post
      end
      puts("Scraped #{@data.count} posts") unless @data.empty?
    end
  end

  # Extracts the GraphQL query hash from Instagram's profile-page JS bundle.
  def scrape_query_hash
    # TODO: scrape bundle name — the hashed filename below changes between deployments.
    bundle_url = "#{BASE_URL}/static/bundles/es6/ProfilePageContainer.js/b10d8b1b32fc.js"
    URI.open(bundle_url).read.match(QUERY_ID_PATTERN)[1]
  end

  # Builds the query-string parameters for one page (50 posts) of the GraphQL query.
  def build_query_params(query_hash, brand_id, end_cursor)
    {
      query_hash: query_hash,
      variables: {
        id: brand_id,
        first: 50,
        after: end_cursor,
      }.to_json,
    }
  end

  # Requests one page of tagged posts, rotating through the proxy list until a
  # request succeeds. Raises a RuntimeError when every proxy has been exhausted.
  def scrape_posts_data(query_params, posts_data = [], proxy_index = 0)
    agent = Mechanize.new
    url = "#{BASE_URL}/graphql/query/?#{URI.encode_www_form(query_params)}"
    while posts_data.empty?
      proxy = @proxies[proxy_index]
      # Fail loudly: the original only printed a warning here and then crashed
      # with NoMethodError on nil when calling proxy.split below.
      raise "No more proxies available" unless proxy

      ip, port = proxy.split(":")
      agent.set_proxy(ip, port.to_i)
      begin
        posts_data = JSON.parse(agent.get(url).body)["data"]["user"]["edge_user_to_photos_of_you"]
      rescue StandardError
        # Proxy failed or the response was malformed: try the next proxy.
        proxy_index += 1
      end
    end
    posts_data
  end

  # Maps a raw GraphQL post node to a CSV row Hash, or nil when the post has
  # fewer likes than @min_likes.
  def parse_post_data(post_data) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
    publisher = post_data["owner"]["username"]
    likes = post_data["edge_liked_by"]["count"]
    return if likes < @min_likes

    {
      publisher: publisher,
      publisher_url: "#{BASE_URL}/#{publisher}",
      post_url: "#{BASE_URL}/p/#{post_data['shortcode']}",
      likes: likes,
      comments: post_data["edge_media_to_comment"]["count"],
      # Time.zone is ActiveSupport-only and this gem declares no dependencies;
      # core Time.at works in plain Ruby.
      date: Time.at(post_data["taken_at_timestamp"]).strftime("%d/%m/%Y"),
      caption: post_data["edge_media_to_caption"]["edges"]&.first&.[]("node")&.[]("text")&.gsub(/\n/, " "),
    }
  end

  # Writes all collected posts to @output_file with humanized column headers
  # (e.g. :publisher_url -> "Publisher url").
  def store_data_in_csv
    headers = @data.first.keys.map { |key| key.to_s.tr("_", " ").capitalize }
    CSV.open(@output_file, "wb", write_headers: true, headers: headers) do |csv|
      @data.each { |post| csv << post.values }
    end
  end
end
117
+
118
# CLI entry point: ARGV[0] = comma-separated brand handles, ARGV[1] = optional minimum likes.
abort("Usage: instagram_scraper BRAND[,BRAND...] [MIN_LIKES]") if ARGV.empty?

options = {
  # Coerce to Integer so it compares cleanly with like counts; nil keeps the scraper default.
  min_likes: ARGV[1]&.to_i,
  # Use the current user's Desktop rather than a hard-coded developer path.
  output_file: File.join(Dir.home, "Desktop", "Instagram Data (#{ARGV[0].gsub(',', ', ').strip}).csv"),
  proxies: URI.open("https://www.proxy-list.download/api/v1/get?type=https").read.split(/\r\n/),
}
scraper = InstagramScraper.new(ARGV[0].split(","), options)
scraper.perform
# Multi-argument system avoids interpolating the path into a shell command;
# errors (e.g. Numbers not installed) are silently discarded as before.
system("open", "-a", "Numbers", options[:output_file], err: File::NULL)
metadata ADDED
@@ -0,0 +1,44 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: instagram-scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - gabrielecanepa
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-08-02 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Scrape tagged posts and their corresponding publishers from an Instagram profile.
14
+ email: contact@gabrielecanepa.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/instagram_scraper.rb
20
+ homepage: https://rubygems.org/gems/instagram-scraper
21
+ licenses:
22
+ - MIT
23
+ metadata:
24
+ source_code_uri: https://github.com/gabrielecanepa/instagram-scraper
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubygems_version: 3.0.3
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Easily scrape any Instagram profile.
44
+ test_files: []